aboutsummaryrefslogtreecommitdiffhomepage
path: root/plugins/tokenize-ja/src
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/tokenize-ja/src')
-rw-r--r--plugins/tokenize-ja/src/lib.rs20
-rw-r--r--plugins/tokenize-ja/src/tokenize.rs34
2 files changed, 54 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/lib.rs b/plugins/tokenize-ja/src/lib.rs
new file mode 100644
index 0000000..fb58250
--- /dev/null
+++ b/plugins/tokenize-ja/src/lib.rs
@@ -0,0 +1,20 @@
+use ciborium::ser::into_writer;
+use wasm_minimal_protocol::*;
+
+initiate_protocol!();
+
+mod tokenize;
+
+/// Plugin entry point: loads the embedded dictionary and builds the
+/// global tokenizer so later `tokenize` calls are cheap.
+/// Returns an empty payload on success.
+#[wasm_func]
+pub fn init() -> Result<Vec<u8>, anyhow::Error> {
+    tokenize::init().map(|()| Vec::new())
+}
+
+/// Tokenizes the UTF-8 bytes in `text` and returns the resulting
+/// `(surface, first-detail)` pairs encoded as CBOR.
+#[wasm_func]
+pub fn tokenize(text: &[u8]) -> Result<Vec<u8>, anyhow::Error> {
+    let text = std::str::from_utf8(text)?;
+    let tokens = tokenize::tokenize(text)?;
+    let mut encoded = Vec::new();
+    into_writer(&tokens, &mut encoded)?;
+    Ok(encoded)
+}
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs
new file mode 100644
index 0000000..b5c13db
--- /dev/null
+++ b/plugins/tokenize-ja/src/tokenize.rs
@@ -0,0 +1,34 @@
+use anyhow::anyhow;
+use anyhow::Context;
+use lindera::dictionary::load_dictionary;
+use lindera::mode::Mode;
+use lindera::segmenter::Segmenter;
+use lindera::tokenizer::Tokenizer;
+use std::sync::OnceLock;
+
+static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
+
+/// Builds the process-wide tokenizer from the embedded IPADIC dictionary.
+///
+/// Idempotent: a repeated call is a no-op rather than an error, so the
+/// host may safely call `init` more than once.
+///
+/// # Errors
+/// Fails if the embedded dictionary cannot be loaded.
+pub fn init() -> Result<(), anyhow::Error> {
+    // Already initialized — nothing to do. (Previously a second call
+    // returned a misleading "failed to initialize tokenizer" error.)
+    if TOKENIZER.get().is_some() {
+        return Ok(());
+    }
+    let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?;
+    // keep_whitespace(true): whitespace is emitted as tokens too, so the
+    // caller can reconstruct the original text from the token stream.
+    let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true);
+    TOKENIZER
+        .set(Tokenizer::new(segmenter))
+        // Only reachable if a concurrent caller raced past the get() check.
+        .map_err(|_| anyhow!("failed to initialize tokenizer"))?;
+    Ok(())
+}
+
+/// Tokenizes `text`, returning each token's surface form paired with
+/// its first morphological detail (if any).
+///
+/// # Errors
+/// Fails if `init` has not been called yet, or if lindera cannot
+/// tokenize the input.
+pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> {
+    // The exported initializer is `init` — there is no `load` function,
+    // so name the right entry point in the error message.
+    let tokenizer = TOKENIZER.get().context("call init() first")?;
+    let mut tokens = tokenizer
+        .tokenize(text)
+        .context("failed to tokenize text")?;
+    // iter_mut: get_detail takes &mut Token (details are resolved lazily).
+    Ok(tokens
+        .iter_mut()
+        .map(|t| {
+            let surface: String = t.surface.to_string();
+            let detail = t.get_detail(0).map(|s| s.into());
+            (surface, detail)
+        })
+        .collect())
+}