From 151461d55c16a96a305a67d0923acdd574687056 Mon Sep 17 00:00:00 2001 From: nsfisis Date: Fri, 31 Oct 2025 22:54:00 +0900 Subject: migrate from SATySFi to Typst --- plugins/tokenize-ja/src/tokenize.rs | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 plugins/tokenize-ja/src/tokenize.rs (limited to 'plugins/tokenize-ja/src/tokenize.rs') diff --git a/plugins/tokenize-ja/src/tokenize.rs b/plugins/tokenize-ja/src/tokenize.rs new file mode 100644 index 0000000..b5c13db --- /dev/null +++ b/plugins/tokenize-ja/src/tokenize.rs @@ -0,0 +1,34 @@ +use anyhow::anyhow; +use anyhow::Context; +use lindera::dictionary::load_dictionary; +use lindera::mode::Mode; +use lindera::segmenter::Segmenter; +use lindera::tokenizer::Tokenizer; +use std::sync::OnceLock; + +static TOKENIZER: OnceLock = OnceLock::new(); + +pub fn init() -> Result<(), anyhow::Error> { + let dictionary = load_dictionary("embedded://ipadic").context("failed to load dictionary")?; + let segmenter = Segmenter::new(Mode::Normal, dictionary, None).keep_whitespace(true); + let tokenizer = Tokenizer::new(segmenter); + TOKENIZER + .set(tokenizer) + .map_err(|_| anyhow!("failed to initialize tokenizer"))?; + Ok(()) +} + +pub fn tokenize(text: &str) -> Result)>, anyhow::Error> { + let tokenizer = TOKENIZER.get().context("call load() first")?; + let mut tokens = tokenizer + .tokenize(text) + .context("failed to tokenize text")?; + Ok(tokens + .iter_mut() + .map(|t| { + let surface: String = t.surface.to_string(); + let detail = t.get_detail(0).map(|s| s.into()); + (surface, detail) + }) + .collect()) +} -- cgit v1.2.3-70-g09d2