blob: f7e06434913fc2de9b13e19f029ba950b565002b (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
use anyhow::anyhow;
use anyhow::Context;
use lindera::dictionary::load_dictionary;
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;
use std::sync::OnceLock;
static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
/// Initializes the process-wide tokenizer from the embedded IPADIC dictionary.
///
/// Must be called once before [`tokenize`].
///
/// # Errors
/// Fails if the embedded dictionary cannot be loaded, or if the tokenizer
/// has already been initialized (the `OnceLock` was already set).
pub fn init() -> Result<(), anyhow::Error> {
    let dict = load_dictionary("embedded://ipadic").context("failed to load dictionary")?;
    let tokenizer = Tokenizer::new(Segmenter::new(Mode::Normal, dict, None));
    // `OnceLock::set` returns Err(value) when already set; the rejected
    // tokenizer is not useful to callers, so collapse it into a plain error.
    match TOKENIZER.set(tokenizer) {
        Ok(()) => Ok(()),
        Err(_) => Err(anyhow!("failed to initialize tokenizer")),
    }
}
/// Tokenizes `text` with the global tokenizer, returning one
/// `(surface, first_detail)` pair per token. The detail is the token's
/// first dictionary detail field (e.g. part of speech for IPADIC), or
/// `None` when the dictionary has no entry for it.
///
/// # Errors
/// Returns an error if [`init`] has not been called yet, or if the
/// underlying tokenizer fails on `text`.
pub fn tokenize(text: &str) -> Result<Vec<(String, Option<String>)>, anyhow::Error> {
    // Fixed message: the setup function is `init()`, not `load()`.
    let tokenizer = TOKENIZER.get().context("call init() first")?;
    let mut tokens = tokenizer
        .tokenize(text)
        .context("failed to tokenize text")?;
    // NOTE(review): `iter_mut` is kept because `get_detail` presumably takes
    // `&mut self` (lindera loads details lazily) — confirm against the
    // lindera version in Cargo.toml before simplifying to `iter()`.
    Ok(tokens
        .iter_mut()
        .map(|t| {
            let surface: String = t.surface.to_string();
            let detail = t.get_detail(0).map(|s| s.into());
            (surface, detail)
        })
        .collect())
}
|