diff options
| author | nsfisis <nsfisis@gmail.com> | 2025-10-31 22:54:00 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2025-10-31 22:54:00 +0900 |
| commit | 151461d55c16a96a305a67d0923acdd574687056 (patch) | |
| tree | 7464865fe584f9ee8b563083cf52a8d4c40a5e92 /plugins/tokenize-ja/src/tokenize.rs | |
| parent | 8c9dfe3472933cf466bf62c83a8303fa35c28409 (diff) | |
| download | phpstudy-N-slides-template-151461d55c16a96a305a67d0923acdd574687056.tar.gz phpstudy-N-slides-template-151461d55c16a96a305a67d0923acdd574687056.tar.zst phpstudy-N-slides-template-151461d55c16a96a305a67d0923acdd574687056.zip | |
Diffstat (limited to 'plugins/tokenize-ja/src/tokenize.rs')
| -rw-r--r-- | plugins/tokenize-ja/src/tokenize.rs | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs new file mode 100644 index 0000000..b5c13db --- /dev/null +++ b/plugins/tokenize-ja/src/tokenize.rs @@ -0,0 +1,34 @@ +use anyhow::anyhow; +use anyhow::Context; +use lindera::dictionary::load_dictionary; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::tokenizer::Tokenizer; +use std::sync::OnceLock; + +static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new(); + +pub fn init() -> Result<(), anyhow::Error> { + let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?; + let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true); + let tokenizer = Tokenizer::new(segmenter); + TOKENIZER + .set(tokenizer) + .map_err(|_| anyhow!("failed to initialize tokenizer"))?; + Ok(()) +} + +pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> { + let tokenizer = TOKENIZER.get().context("call load() first")?; + let mut tokens = tokenizer + .tokenize(text) + .context("failed to tokenize text")?; + Ok(tokens + .iter_mut() + .map(|t| { + let surface: String = t.surface.to_string(); + let detail = t.get_detail(0).map(|s| s.into()); + (surface, detail) + }) + .collect()) +} |
