diff --git a/static-lang-word-lists/src/lib.rs b/static-lang-word-lists/src/lib.rs index 01b40df..d8dbcb9 100644 --- a/static-lang-word-lists/src/lib.rs +++ b/static-lang-word-lists/src/lib.rs @@ -66,9 +66,10 @@ //! download by setting the environment variable `STATIC_LANG_WORD_LISTS_LOCAL`. //! Otherwise, you're welcome to audit the [build script](https://github.com/googlefonts/fontheight/blob/main/static-lang-word-lists/build.rs). +mod metadata; mod word_lists; -pub(crate) use word_lists::WordListMetadata; +pub use metadata::*; #[cfg(feature = "rayon")] pub use word_lists::rayon::ParWordListIter; pub use word_lists::{WordList, WordListError, WordListIter}; @@ -124,4 +125,5 @@ macro_rules! word_list { // Module declaration has to be below macro definition to be able to use it mod declarations; + pub use declarations::*; diff --git a/static-lang-word-lists/src/metadata.rs b/static-lang-word-lists/src/metadata.rs new file mode 100644 index 0000000..fb3509c --- /dev/null +++ b/static-lang-word-lists/src/metadata.rs @@ -0,0 +1,119 @@ +use std::{borrow::Cow, fs, path::Path}; + +use serde::Deserialize; + +use crate::WordListError; + +/// Metadata about a [`WordList`](crate::WordList). +/// +/// If you don't want to mess around with the 🐄s, convenience methods are +/// provided for reading fields: +/// - [`WordListMetadata::name`] +/// - [`WordListMetadata::script`] +/// - [`WordListMetadata::language`] +#[derive(Debug, Clone, Eq, PartialEq, Hash, Deserialize)] +#[serde(deny_unknown_fields)] +pub struct WordListMetadata { + /// The cosmetic name for the word list + pub name: Cow<'static, str>, + /// The script of the word list, if known. + /// + /// The script is expected to be an [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924) + /// four-letter capitalised code. + pub script: Option>, + /// The language of the word list, if known. + /// + /// The language is expected to be an [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) + /// two-letter code. + pub language: Option>, +} + +impl WordListMetadata { + // Used by word_list! + #[must_use] + pub(crate) const fn new( + name: &'static str, + script: Option<&'static str>, + language: Option<&'static str>, + ) -> Self { + // Can't use Option::map in const context + let script = match script { + Some(script) => Some(Cow::Borrowed(script)), + None => None, + }; + let language = match language { + Some(language) => Some(Cow::Borrowed(language)), + None => None, + }; + WordListMetadata { + name: Cow::Borrowed(name), + script, + language, + } + } + + /// Load metadata from an on-disk TOML file + #[allow(clippy::result_large_err)] + pub fn load( + metadata_path: impl AsRef, + ) -> Result { + let path = metadata_path.as_ref(); + let metadata_content = fs::read_to_string(path).map_err(|io_err| { + WordListError::FailedToRead(path.to_owned(), io_err) + })?; + let metadata: WordListMetadata = toml::from_str(&metadata_content) + .map_err(|json_err| { + WordListError::MetadataError(path.to_owned(), json_err) + })?; + Ok(metadata) + } + + pub(crate) fn new_from_name(name: impl Into) -> Self { + WordListMetadata { + name: Cow::Owned(name.into()), + script: None, + language: None, + } + } + + /// Get the name of the word list. + #[inline] + #[must_use] + pub fn name(&self) -> &str { + self.name.as_ref() + } + + /// Get the script of the word list, if known. + /// + /// The script is expected to be an [ISO 15924](https://en.wikipedia.org/wiki/ISO_15924) + /// four-letter capitalised code, but this is only guaranteed for built-in + /// word lists. + #[inline] + #[must_use] + pub fn script(&self) -> Option<&str> { + self.script.as_deref() + } + + /// Get the language of the word list, if known. + /// + /// The language is expected to be an [ISO 639-1](https://en.wikipedia.org/wiki/ISO_639-1) + /// two-letter code, but this is only guaranteed for built-in word lists. + #[inline] + #[must_use] + pub fn language(&self) -> Option<&str> { + self.language.as_deref() + } +} + +impl From for WordListMetadata +where + S: Into>, +{ + fn from(word_list_name: S) -> Self { + WordListMetadata { + name: word_list_name.into(), + script: None, + language: None, + } + } +} diff --git a/static-lang-word-lists/src/word_lists.rs b/static-lang-word-lists/src/word_lists.rs index f7ad642..43da153 100644 --- a/static-lang-word-lists/src/word_lists.rs +++ b/static-lang-word-lists/src/word_lists.rs @@ -7,74 +7,30 @@ use std::{ sync::LazyLock, }; -use serde::Deserialize; use thiserror::Error; -use crate::newline_delimited_words; +use crate::{metadata::WordListMetadata, newline_delimited_words}; // TODO: this can be Box pub(crate) type Word = String; pub(crate) type WordSource = Box<[Word]>; -#[derive(Debug, Deserialize)] -#[serde(deny_unknown_fields)] -pub(crate) struct WordListMetadata { - name: Cow<'static, str>, - script: Option>, - language: Option>, -} - -impl WordListMetadata { - // Used by word_list! - #[must_use] - pub(crate) const fn new( - name: &'static str, - script: Option<&'static str>, - language: Option<&'static str>, - ) -> Self { - // Can't use Option::map in const context - let script = match script { - Some(script) => Some(Cow::Borrowed(script)), - None => None, - }; - let language = match language { - Some(language) => Some(Cow::Borrowed(language)), - None => None, - }; - WordListMetadata { - name: Cow::Borrowed(name), - script, - language, - } - } - - #[allow(clippy::result_large_err)] - fn load(metadata_path: impl AsRef) -> Result { - let path = metadata_path.as_ref(); - let metadata_content = fs::read_to_string(path).map_err(|io_err| { - WordListError::FailedToRead(path.to_owned(), io_err) - })?; - let metadata: WordListMetadata = toml::from_str(&metadata_content) - .map_err(|json_err| { - WordListError::MetadataError(path.to_owned(), json_err) - })?; - Ok(metadata) - } - - fn new_from_name(name: impl Into) -> Self { - WordListMetadata { - name: Cow::Owned(name.into()), - script: None, - language: None, - } - } -} - /// A list of words, with optional additional metadata. #[derive(Debug)] pub struct WordList { words: EagerOrLazy, - metadata: WordListMetadata, + /// Metadata associated with this word list. + /// + /// Includes the word list's name, script (if known), and language (if + /// known). + /// + /// You usually only need to access this directly if you plan to edit the + /// metadata, as otherwise you can access metadata from the `WordList` + /// directly: + /// - [`WordList::name`] + /// - [`WordList::script`] + /// - [`WordList::language`] + pub metadata: WordListMetadata, } impl WordList { @@ -132,14 +88,17 @@ impl WordList { /// Create a new word list from an iterable. /// - /// Metadata is unspecified. + /// Types that `impl Into`: + /// - [`&str`] (used as name of word list) + /// - [`String`] (used as name of word list) + /// - [`WordListMetadata`] #[must_use] pub fn define( - name: impl Into, + name_or_metadata: impl Into, words: impl IntoIterator>, ) -> Self { WordList { - metadata: WordListMetadata::new_from_name(name.into()), + metadata: name_or_metadata.into(), words: words.into_iter().map(Into::into).collect::>().into(), } } @@ -173,7 +132,7 @@ impl WordList { #[inline] #[must_use] pub fn name(&self) -> &str { - &self.metadata.name + self.metadata.name() } /// Get the script of the word list, if known. @@ -184,7 +143,7 @@ impl WordList { #[inline] #[must_use] pub fn script(&self) -> Option<&str> { - self.metadata.script.as_deref() + self.metadata.script() } /// Get the language of the word list, if known. @@ -194,7 +153,7 @@ impl WordList { #[inline] #[must_use] pub fn language(&self) -> Option<&str> { - self.metadata.language.as_deref() + self.metadata.language() } /// Iterate through the word list. @@ -216,6 +175,43 @@ impl WordList { pub fn is_empty(&self) -> bool { self.words.is_empty() } + + /// Create a new word list by removing words from an existing one, according + /// to the `predicate`. + /// + /// You can think of this similar to calling [`Vec::retain`], except it + /// returns a new list instead of modifying the old one in-place. + /// Metadata isn't modified. + pub fn filter(&self, mut predicate: F) -> Self + where + F: FnMut(&str) -> bool, + { + let reduced_words = self + .words + .iter() + .filter(|word| predicate(word)) + .cloned() + .collect::>(); + let reduced_words = + EagerOrLazy::Eager(reduced_words.into_boxed_slice()); + Self { + metadata: self.metadata.clone(), + words: reduced_words, + } + } +} + +impl Clone for WordList { + /// Returns a duplicate of the value. + /// + /// Note: this will load the word list for `&self` and the newly returned + /// word list. + fn clone(&self) -> Self { + Self { + metadata: self.metadata.clone(), + words: EagerOrLazy::Eager(self.words.deref().clone()), + } + } } impl Index for WordList {