diff options
Diffstat (limited to 'plugins/tokenize-ja/src')
| -rw-r--r-- | plugins/tokenize-ja/src/lib.rs | 20 | ||||
| -rw-r--r-- | plugins/tokenize-ja/src/tokenize.rs | 34 |
2 files changed, 54 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/lib.rs b/plugins/tokenize-ja/src/lib.rs
new file mode 100644
index 0000000..fb58250
--- /dev/null
+++ b/plugins/tokenize-ja/src/lib.rs
@@ -0,0 +1,28 @@
+//! Plugin entry points exported through `wasm_minimal_protocol`.
+//!
+//! Both functions return raw bytes; `tokenize` encodes its result as CBOR.
+
+use ciborium::ser::into_writer;
+use wasm_minimal_protocol::*;
+
+initiate_protocol!();
+
+mod tokenize;
+
+// Builds the shared tokenizer and returns an empty byte vector on success.
+// Must be called once before `tokenize`.
+#[wasm_func]
+pub fn init() -> Result<Vec<u8>, anyhow::Error> {
+    tokenize::init()?;
+    Ok(Vec::new())
+}
+
+// Decodes `text` as UTF-8, tokenizes it, and returns the list of
+// `(surface, detail)` pairs serialized with `ciborium` (CBOR).
+#[wasm_func]
+pub fn tokenize(text: &[u8]) -> Result<Vec<u8>, anyhow::Error> {
+    let result = tokenize::tokenize(std::str::from_utf8(text)?)?;
+    let mut out = Vec::new();
+    into_writer(&result, &mut out)?;
+    Ok(out)
+}
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs
new file mode 100644
index 0000000..b5c13db
--- /dev/null
+++ b/plugins/tokenize-ja/src/tokenize.rs
@@ -0,0 +1,48 @@
+use anyhow::anyhow;
+use anyhow::Context;
+use lindera::dictionary::load_dictionary;
+use lindera::mode::Mode;
+use lindera::segmenter::Segmenter;
+use lindera::tokenizer::Tokenizer;
+use std::sync::OnceLock;
+
+/// Shared tokenizer, set once by [`init`] and read by [`tokenize`].
+static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
+
+/// Builds a tokenizer from the `embedded://ipadic` dictionary and stores it in [`TOKENIZER`].
+///
+/// # Errors
+///
+/// Fails if the dictionary cannot be loaded, or if [`TOKENIZER`] has
+/// already been set (for example because `init` was called twice).
+pub fn init() -> Result<(), anyhow::Error> {
+    let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?;
+    let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true);
+    let tokenizer = Tokenizer::new(segmenter);
+    TOKENIZER
+        .set(tokenizer)
+        .map_err(|_| anyhow!("failed to initialize tokenizer"))?;
+    Ok(())
+}
+
+/// Splits `text` into tokens, pairing each token's surface string with
+/// the value of `get_detail(0)` when one is available.
+///
+/// # Errors
+///
+/// Fails if [`init`] has not been called yet, or if the tokenizer returns an error.
+pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> {
+    let tokenizer = TOKENIZER.get().context("call init() first")?;
+    // NOTE(review): tokens are iterated mutably, presumably because `get_detail` takes `&mut self`.
+    let mut tokens = tokenizer
+        .tokenize(text)
+        .context("failed to tokenize text")?;
+    Ok(tokens
+        .iter_mut()
+        .map(|t| {
+            let surface: String = t.surface.to_string();
+            let detail = t.get_detail(0).map(|s| s.into());
+            (surface, detail)
+        })
+        .collect())
+}
