diff options
| author | nsfisis <nsfisis@gmail.com> | 2025-11-24 12:07:52 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2025-11-24 12:07:52 +0900 |
| commit | 8b2dbea5f367bf925f2370a753dddb4c6af0cc0d (patch) | |
| tree | 1b12c186def3c4b66a7ca42b6a54250387627bb3 /plugins/tokenize-ja/src/tokenize.rs | |
| parent | af847a4fd3a2dbcc6840944e0385b5c2830b13aa (diff) | |
| download | phpconkagawa-2025-slides-8b2dbea5f367bf925f2370a753dddb4c6af0cc0d.tar.gz phpconkagawa-2025-slides-8b2dbea5f367bf925f2370a753dddb4c6af0cc0d.tar.zst phpconkagawa-2025-slides-8b2dbea5f367bf925f2370a753dddb4c6af0cc0d.zip | |
Diffstat (limited to 'plugins/tokenize-ja/src/tokenize.rs')
| -rw-r--r-- | plugins/tokenize-ja/src/tokenize.rs | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs new file mode 100644 index 0000000..b5c13db --- /dev/null +++ b/plugins/tokenize-ja/src/tokenize.rs @@ -0,0 +1,34 @@ +use anyhow::anyhow; +use anyhow::Context; +use lindera::dictionary::load_dictionary; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::tokenizer::Tokenizer; +use std::sync::OnceLock; + +static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new(); + +pub fn init() -> Result<(), anyhow::Error> { + let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?; + let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true); + let tokenizer = Tokenizer::new(segmenter); + TOKENIZER + .set(tokenizer) + .map_err(|_| anyhow!("failed to initialize tokenizer"))?; + Ok(()) +} + +pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> { + let tokenizer = TOKENIZER.get().context("call load() first")?; + let mut tokens = tokenizer + .tokenize(text) + .context("failed to tokenize text")?; + Ok(tokens + .iter_mut() + .map(|t| { + let surface: String = t.surface.to_string(); + let detail = t.get_detail(0).map(|s| s.into()); + (surface, detail) + }) + .collect()) +} |
