summaryrefslogtreecommitdiffhomepage
path: root/plugins/tokenize-ja/src/tokenize.rs
diff options
context:
space:
mode:
authornsfisis <nsfisis@gmail.com>2025-10-20 02:32:55 +0900
committernsfisis <nsfisis@gmail.com>2025-10-26 22:47:53 +0900
commit751f595ff1d2142688dc833b4a7ec3e643a5c8a9 (patch)
tree17a32750558e2493f4998dd0341f37647c322b80 /plugins/tokenize-ja/src/tokenize.rs
parent5201520a74d8fa49f77a588204d768d8157f73f7 (diff)
downloadphpstudy-180-slides-751f595ff1d2142688dc833b4a7ec3e643a5c8a9.tar.gz
phpstudy-180-slides-751f595ff1d2142688dc833b4a7ec3e643a5c8a9.tar.zst
phpstudy-180-slides-751f595ff1d2142688dc833b4a7ec3e643a5c8a9.zip
draft2
Diffstat (limited to 'plugins/tokenize-ja/src/tokenize.rs')
-rw-r--r--plugins/tokenize-ja/src/tokenize.rs34
1 file changed, 34 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs
new file mode 100644
index 0000000..f7e0643
--- /dev/null
+++ b/plugins/tokenize-ja/src/tokenize.rs
@@ -0,0 +1,34 @@
+use anyhow::anyhow;
+use anyhow::Context;
+use lindera::dictionary::load_dictionary;
+use lindera::mode::Mode;
+use lindera::segmenter::Segmenter;
+use lindera::tokenizer::Tokenizer;
+use std::sync::OnceLock;
+
+static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
+
+pub fn init() -> Result<(), anyhow::Error> {
+ let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?;
+ let segmenter = Segmenter::new(Mode::Normal, dictionary, None);
+ let tokenizer = Tokenizer::new(segmenter);
+ TOKENIZER
+ .set(tokenizer)
+ .map_err(|_| anyhow!("failed to initialize tokenizer"))?;
+ Ok(())
+}
+
+pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> {
+ let tokenizer = TOKENIZER.get().context("call load() first")?;
+ let mut tokens = tokenizer
+ .tokenize(text)
+ .context("failed to tokenize text")?;
+ Ok(tokens
+ .iter_mut()
+ .map(|t| {
+ let surface: String = t.surface.to_string();
+ let detail = t.get_detail(0).map(|s| s.into());
+ (surface, detail)
+ })
+ .collect())
+}