diff options
| author | nsfisis <nsfisis@gmail.com> | 2026-03-21 15:16:17 +0900 |
|---|---|---|
| committer | nsfisis <nsfisis@gmail.com> | 2026-03-21 15:16:17 +0900 |
| commit | 56547d348ccaca4369560a5fe69aba44c2521ebd (patch) | |
| tree | d4c977985900ee1ee87ced1e43986653932c791d /plugins/tokenize-ja/src/tokenize.rs | |
| parent | b00d51802e6a5bd5dc46d9ab64ce6376d7f0121b (diff) | |
| download | phperkaigi-2026-php-modification-slides-56547d348ccaca4369560a5fe69aba44c2521ebd.tar.gz phperkaigi-2026-php-modification-slides-56547d348ccaca4369560a5fe69aba44c2521ebd.tar.zst phperkaigi-2026-php-modification-slides-56547d348ccaca4369560a5fe69aba44c2521ebd.zip | |
wip
Diffstat (limited to 'plugins/tokenize-ja/src/tokenize.rs')
| -rw-r--r-- | plugins/tokenize-ja/src/tokenize.rs | 34 |
1 files changed, 34 insertions, 0 deletions
diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs new file mode 100644 index 0000000..b5c13db --- /dev/null +++ b/plugins/tokenize-ja/src/tokenize.rs @@ -0,0 +1,34 @@ +use anyhow::anyhow; +use anyhow::Context; +use lindera::dictionary::load_dictionary; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::tokenizer::Tokenizer; +use std::sync::OnceLock; + +static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new(); + +pub fn init() -> Result<(), anyhow::Error> { + let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?; + let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true); + let tokenizer = Tokenizer::new(segmenter); + TOKENIZER + .set(tokenizer) + .map_err(|_| anyhow!("failed to initialize tokenizer"))?; + Ok(()) +} + +pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> { + let tokenizer = TOKENIZER.get().context("call load() first")?; + let mut tokens = tokenizer + .tokenize(text) + .context("failed to tokenize text")?; + Ok(tokens + .iter_mut() + .map(|t| { + let surface: String = t.surface.to_string(); + let detail = t.get_detail(0).map(|s| s.into()); + (surface, detail) + }) + .collect()) +} |
