lindera/lindera

A multilingual morphological analysis library.

License: MIT

A morphological analysis library in Rust. This project is a fork of kuromoji-rs.

Lindera aims to be easy to install and to provide concise APIs for various Rust applications.

Tokenization examples

Basic tokenization

Put the following in Cargo.toml:

[dependencies]
lindera = { version = "0.44.1", features = ["ipadic"] }
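
Alternatively, if your Cargo is recent enough to ship the cargo add subcommand (1.62 or later), the same dependency can be added from the command line:

% cargo add lindera --features ipadic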

This example covers the basic usage of Lindera.

It will:

  • Create a tokenizer in normal mode
  • Tokenize the input text
  • Output the tokens
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;
use lindera::LinderaResult;

fn main() -> LinderaResult<()> {
    // Load the IPADIC dictionary and create a segmenter in normal mode.
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)?;
    let segmenter = Segmenter::new(
        Mode::Normal,
        dictionary,
        None, // No user dictionary is provided.
    );

    // Create a tokenizer.
    let tokenizer = Tokenizer::new(segmenter);

    // Tokenize a text.
    let text = "関西国際空港限定トートバッグ";
    let mut tokens = tokenizer.tokenize(text)?;

    // Print the text and tokens.
    println!("text:\t{}", text);
    for token in tokens.iter_mut() {
        let details = token.details().join(",");
        println!("token:\t{}\t{}", token.text.as_ref(), details);
    }

    Ok(())
}

The above example can be run as follows:

% cargo run --features=ipadic --example=tokenize

You can see the result as follows:

text:   関西国際空港限定トートバッグ
token:  関西国際空港    名詞,固有名詞,組織,*,*,*,関西国際空港,カンサイコクサイクウコウ,カンサイコクサイクーコー
token:  限定    名詞,サ変接続,*,*,*,*,限定,ゲンテイ,ゲンテイ
token:  トートバッグ    UNK
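
The details() columns shown above are plain strings (in IPADIC, the first column is the coarse part-of-speech tag), so tokens can be post-processed with ordinary string handling. As a minimal sketch of our own (not one of the bundled examples), the loop below keeps only nouns, reusing the tokens from the code above:

for token in tokens.iter_mut() {
    // In IPADIC output the first detail column is the part-of-speech
    // tag, e.g. "名詞" (noun).
    let details = token.details().join(",");
    if details.starts_with("名詞") {
        println!("noun:\t{}", token.text.as_ref());
    }
}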

Tokenization with user dictionary

You can provide user dictionary entries alongside the default system dictionary. The user dictionary should be a CSV file in the following format:

<surface>,<part_of_speech>,<reading>

Put the following in Cargo.toml:

[dependencies]
lindera = { version = "0.44.1", features = ["ipadic"] }

For example:

% cat ./resources/simple_userdic.csv
東京スカイツリー,カスタム名詞,トウキョウスカイツリー
東武スカイツリーライン,カスタム名詞,トウブスカイツリーライン
とうきょうスカイツリー駅,カスタム名詞,トウキョウスカイツリーエキ

With a user dictionary, the Tokenizer is created as follows:

use std::path::PathBuf;

use lindera::dictionary::{
    load_dictionary_from_kind, load_user_dictionary_from_csv, DictionaryKind,
};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::tokenizer::Tokenizer;
use lindera::LinderaResult;

fn main() -> LinderaResult<()> {
    // Path to the user dictionary CSV.
    let user_dict_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
        .join("../resources")
        .join("ipadic_simple_userdic.csv");

    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)?;
    let user_dictionary =
        load_user_dictionary_from_csv(DictionaryKind::IPADIC, user_dict_path.as_path())?;

    let segmenter = Segmenter::new(
        Mode::Normal,
        dictionary,
        Some(user_dictionary), // Pass the user dictionary to the segmenter.
    );

    // Create a tokenizer.
    let tokenizer = Tokenizer::new(segmenter);

    // Tokenize a text.
    let text = "東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です";
    let mut tokens = tokenizer.tokenize(text)?;

    // Print the text and tokens.
    println!("text:\t{}", text);
    for token in tokens.iter_mut() {
        let details = token.details().join(",");
        println!("token:\t{}\t{}", token.text.as_ref(), details);
    }

    Ok(())
}

The above example can be run with cargo run --example:

% cargo run --features=ipadic --example=tokenize_with_user_dict

text:   東京スカイツリーの最寄り駅はとうきょうスカイツリー駅です
token:  東京スカイツリー        カスタム名詞,*,*,*,*,*,東京スカイツリー,トウキョウスカイツリー,*
token:  の      助詞,連体化,*,*,*,*,の,ノ,ノ
token:  最寄り駅        名詞,一般,*,*,*,*,最寄り駅,モヨリエキ,モヨリエキ
token:  は      助詞,係助詞,*,*,*,*,は,ハ,ワ
token:  とうきょうスカイツリー駅        カスタム名詞,*,*,*,*,*,とうきょうスカイツリー駅,トウキョウスカイツリーエキ,*
token:  です    助動詞,*,*,*,特殊・デス,基本形,です,デス,デス

Tokenization with filters

Put the following in Cargo.toml:

[dependencies]
lindera = { version = "0.44.1", features = ["ipadic"] }

This example covers the basic usage of the Lindera analysis framework.

It will:

  • Apply character filters for Unicode normalization (NFKC) and Japanese iteration marks
  • Tokenize the input text with IPADIC
  • Apply token filters for Japanese compound words, Japanese numbers, and removing stop tags (part-of-speech)
use std::collections::HashSet;

use lindera::character_filter::japanese_iteration_mark::JapaneseIterationMarkCharacterFilter;
use lindera::character_filter::unicode_normalize::{
    UnicodeNormalizeCharacterFilter, UnicodeNormalizeKind,
};
use lindera::character_filter::BoxCharacterFilter;
use lindera::dictionary::{load_dictionary_from_kind, DictionaryKind};
use lindera::mode::Mode;
use lindera::segmenter::Segmenter;
use lindera::token_filter::japanese_compound_word::JapaneseCompoundWordTokenFilter;
use lindera::token_filter::japanese_number::JapaneseNumberTokenFilter;
use lindera::token_filter::japanese_stop_tags::JapaneseStopTagsTokenFilter;
use lindera::token_filter::BoxTokenFilter;
use lindera::tokenizer::Tokenizer;
use lindera::LinderaResult;

fn main() -> LinderaResult<()> {
    let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC)?;
    let segmenter = Segmenter::new(
        Mode::Normal,
        dictionary,
        None, // No user dictionary is provided.
    );

    // Character filter: Unicode normalization (NFKC).
    let unicode_normalize_char_filter =
        UnicodeNormalizeCharacterFilter::new(UnicodeNormalizeKind::NFKC);

    // Character filter: normalize kanji and kana iteration marks.
    let japanese_iteration_mark_char_filter =
        JapaneseIterationMarkCharacterFilter::new(true, true);

    // Token filter: merge number tokens into compound words.
    let japanese_compound_word_token_filter = JapaneseCompoundWordTokenFilter::new(
        DictionaryKind::IPADIC,
        vec!["名詞,数".to_string(), "名詞,接尾,助数詞".to_string()]
            .into_iter()
            .collect(),
        Some("複合語".to_string()),
    );

    // Token filter: normalize Japanese numbers.
    let japanese_number_token_filter =
        JapaneseNumberTokenFilter::new(Some(vec!["名詞,数".to_string()].into_iter().collect()));

    // Token filter: drop tokens whose part-of-speech is in the stop-tag set.
    let japanese_stop_tags_token_filter = JapaneseStopTagsTokenFilter::new(
        vec![
            "接続詞".to_string(),
            "助詞".to_string(),
            "助詞,格助詞".to_string(),
            "助詞,格助詞,一般".to_string(),
            "助詞,格助詞,引用".to_string(),
            "助詞,格助詞,連語".to_string(),
            "助詞,係助詞".to_string(),
            "助詞,副助詞".to_string(),
            "助詞,間投助詞".to_string(),
            "助詞,並立助詞".to_string(),
            "助詞,終助詞".to_string(),
            "助詞,副助詞/並立助詞/終助詞".to_string(),
            "助詞,連体化".to_string(),
            "助詞,副詞化".to_string(),
            "助詞,特殊".to_string(),
            "助動詞".to_string(),
            "記号".to_string(),
            "記号,一般".to_string(),
            "記号,読点".to_string(),
            "記号,句点".to_string(),
            "記号,空白".to_string(),
            "記号,括弧閉".to_string(),
            "その他,間投".to_string(),
            "フィラー".to_string(),
            "非言語音".to_string(),
        ]
        .into_iter()
        .collect(),
    );

    // Create a tokenizer and register the filters.
    let mut tokenizer = Tokenizer::new(segmenter);
    tokenizer
        .append_character_filter(BoxCharacterFilter::from(unicode_normalize_char_filter))
        .append_character_filter(BoxCharacterFilter::from(japanese_iteration_mark_char_filter))
        .append_token_filter(BoxTokenFilter::from(japanese_compound_word_token_filter))
        .append_token_filter(BoxTokenFilter::from(japanese_number_token_filter))
        .append_token_filter(BoxTokenFilter::from(japanese_stop_tags_token_filter));

    // Tokenize a text.
    let text = "Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。";
    let tokens = tokenizer.tokenize(text)?;

    // Print the text and tokens.
    println!("text: {}", text);
    for token in tokens {
        println!(
            "token: {:?}, start: {:?}, end: {:?}, details: {:?}",
            token.text, token.byte_start, token.byte_end, token.details
        );
    }

    Ok(())
}

The above example can be run as follows:

% cargo run --features=ipadic --example=tokenize_with_filters

You can see the result as follows:

text: Linderaは形態素解析エンジンです。ユーザー辞書も利用可能です。
token: "Lindera", start: 0, end: 21, details: Some(["UNK"])
token: "形態素", start: 24, end: 33, details: Some(["名詞", "一般", "*", "*", "*", "*", "形態素", "ケイタイソ", "ケイタイソ"])
token: "解析", start: 33, end: 39, details: Some(["名詞", "サ変接続", "*", "*", "*", "*", "解析", "カイセキ", "カイセキ"])
token: "エンジン", start: 39, end: 54, details: Some(["名詞", "一般", "*", "*", "*", "*", "エンジン", "エンジン", "エンジン"])
token: "ユーザー", start: 63, end: 75, details: Some(["名詞", "一般", "*", "*", "*", "*", "ユーザー", "ユーザー", "ユーザー"])
token: "辞書", start: 75, end: 81, details: Some(["名詞", "一般", "*", "*", "*", "*", "辞書", "ジショ", "ジショ"])
token: "利用", start: 84, end: 90, details: Some(["名詞", "サ変接続", "*", "*", "*", "*", "利用", "リヨウ", "リヨー"])
token: "可能", start: 90, end: 96, details: Some(["名詞", "形容動詞語幹", "*", "*", "*", "*", "可能", "カノウ", "カノー"])

Configuration file

Lindera can read configuration files in YAML format. Set the LINDERA_CONFIG_PATH environment variable to the path of a file like the following, and the tokenizer's behavior can be configured without writing Rust code.

segmenter:
  mode: "normal"
  dictionary:
    kind: "ipadic"
  user_dictionary:
    path: "./resources/ipadic_simple.csv"
    kind: "ipadic"

character_filters:
  - kind: "unicode_normalize"
    args:
      kind: "nfkc"
  - kind: "japanese_iteration_mark"
    args:
      normalize_kanji: true
      normalize_kana: true
  - kind: mapping
    args:
      mapping:
        リンデラ: Lindera

token_filters:
  - kind: "japanese_compound_word"
    args:
      kind: "ipadic"
      tags:
        - "名詞,数"
        - "名詞,接尾,助数詞"
      new_tag: "名詞,数"
  - kind: "japanese_number"
    args:
      tags:
        - "名詞,数"
  - kind: "japanese_stop_tags"
    args:
      tags:
        - "接続詞"
        - "助詞"
        - "助詞,格助詞"
        - "助詞,格助詞,一般"
        - "助詞,格助詞,引用"
        - "助詞,格助詞,連語"
        - "助詞,係助詞"
        - "助詞,副助詞"
        - "助詞,間投助詞"
        - "助詞,並立助詞"
        - "助詞,終助詞"
        - "助詞,副助詞/並立助詞/終助詞"
        - "助詞,連体化"
        - "助詞,副詞化"
        - "助詞,特殊"
        - "助動詞"
        - "記号"
        - "記号,一般"
        - "記号,読点"
        - "記号,句点"
        - "記号,空白"
        - "記号,括弧閉"
        - "その他,間投"
        - "フィラー"
        - "非言語音"
  - kind: "japanese_katakana_stem"
    args:
      min: 3
  - kind: "remove_diacritical_mark"
    args:
      japanese: false
% export LINDERA_CONFIG_PATH=./resources/lindera.yml
use std::path::PathBuf;

use lindera::tokenizer::TokenizerBuilder;
use lindera::LinderaResult;

fn main() -> LinderaResult<()> {
    // Create a new `TokenizerBuilder`, loading the settings from the
    // specified configuration file.
    let builder = TokenizerBuilder::from_file(PathBuf::from("./resources/lindera.yml").as_path())?;
    let tokenizer = builder.build()?;

    // Tokenize a text.
    let text = "関西国際空港限定トートバッグ";
    let mut tokens = tokenizer.tokenize(text)?;

    // Print the text and tokens.
    println!("text:\t{}", text);
    for token in tokens.iter_mut() {
        let details = token.details().join(",");
        println!("token:\t{}\t{}", token.text.as_ref(), details);
    }

    Ok(())
}
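
If you prefer to rely on the LINDERA_CONFIG_PATH environment variable exported above rather than an explicit path, the builder can also be constructed without one. A minimal sketch, under the assumption that TokenizerBuilder::new() falls back to that variable when it is set:

use lindera::tokenizer::TokenizerBuilder;
use lindera::LinderaResult;

fn main() -> LinderaResult<()> {
    // Assumption: `TokenizerBuilder::new()` loads its initial settings
    // from the file named by LINDERA_CONFIG_PATH when that variable is set.
    let builder = TokenizerBuilder::new()?;
    let tokenizer = builder.build()?;

    let text = "関西国際空港限定トートバッグ";
    let mut tokens = tokenizer.tokenize(text)?;
    for token in tokens.iter_mut() {
        println!("token:\t{}", token.text.as_ref());
    }

    Ok(())
}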

API reference

The API reference is available. Please see the following URL:
