JapaneseNLP

Apache Solr 7.6 + SolrJ で形態素解析の詳しい結果を取得する(日本語)

概要

Solr の日本語解析では、デフォルトで形態素解析が行われます。

Java(SolrJ)から、この形態素解析の詳しい結果(品詞・読みなど)を取得したいときのコードです。

ソースコード

package hello.solr;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/**
 * Minimal SolrJ example that sends one Japanese sentence to a Solr core as a
 * {@link DocumentAnalysisRequest} and prints the per-token analysis result
 * (surface text, type, part of speech, reading) to standard error.
 *
 * <p>The core name, field name and Solr URL are hard-coded in {@link #main};
 * a Solr server must be reachable at that URL for the program to work.
 */
public class HelloAnalysisJapaneseSimple {

    /** Key under which the tokenizer stage's token list is looked up in the analysis result. */
    private static final String TOKENIZER_CLASS = "org.apache.lucene.analysis.ja.JapaneseTokenizer";

    /** Token attribute key for the part of speech. */
    private static final String ATTR_PART_OF_SPEECH =
            "org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech";

    /** Token attribute key for the reading. */
    private static final String ATTR_READING =
            "org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading";

    /**
     * Builds a single-field document, asks Solr to analyze it, and prints the result.
     *
     * @param args unused
     * @throws IllegalStateException if the response lacks one of the nested sections
     *         this program navigates through (for example when the field is unknown
     *         to the core)
     * @throws Exception if the request to Solr fails
     */
    public static void main(String[] args) throws Exception {
        String fieldName = "field_text_ja";
        String coreName = "core_nlp";
        String text = "こんにちは。今日はいい天気ですね。私は日産自動車の社員です。";

        // Document with an id and the text to analyze.
        HashMap<String, SolrInputField> fields = new HashMap<String, SolrInputField>();
        SolrInputDocument doc = new SolrInputDocument(fields);
        doc.setField("id", "0");
        doc.setField(fieldName, text);

        // Request
        DocumentAnalysisRequest request = new DocumentAnalysisRequest();
        request.addDocument(doc);

        String solrLocation = "http://localhost:8983/solr/" + coreName;

        // The client owns HTTP resources; close it as soon as the response is in hand.
        NamedList<Object> response;
        try (SolrClient client = new HttpSolrClient.Builder(solrLocation).build()) {
            response = client.request(request);
        }

        // Walk the response: "analysis" -> first entry -> field name -> "index" -> first entry.
        // NOTE(review): the first entries are presumably keyed by document id and by
        // field value respectively -- confirm against the Solr response format.
        NamedList<?> analysis = (NamedList<?>) requirePresent(
                response.get("analysis"), "'analysis' section of the response");
        SimpleOrderedMap<?> firstEntry = (SimpleOrderedMap<?>) firstValue(
                analysis, "'analysis' section of the response");
        SimpleOrderedMap<?> fieldResult = (SimpleOrderedMap<?>) requirePresent(
                firstEntry.get(fieldName), "analysis for field '" + fieldName + "'");
        SimpleOrderedMap<?> index = (SimpleOrderedMap<?>) requirePresent(
                fieldResult.get("index"), "'index' analysis for field '" + fieldName + "'");
        NamedList<?> nlpResult = (NamedList<?>) firstValue(
                index, "'index' analysis for field '" + fieldName + "'");

        // Dump every stage of the analysis chain (tokenizer and filters) as name=value.
        System.err.println("Tokenizer,Filter ---");
        for (int n = 0; n < nlpResult.size(); n++) {
            System.err.println(nlpResult.getName(n) + "=" + nlpResult.getVal(n));
        }

        // Tokens as produced by the tokenizer stage; absent when the field's
        // analyzer does not use that tokenizer, in which case nothing more is printed.
        ArrayList<?> wordListPOS = (ArrayList<?>) nlpResult.get(TOKENIZER_CLASS);
        if (wordListPOS == null) {
            return;
        }

        for (int n = 0; n < wordListPOS.size(); n++) {
            SimpleOrderedMap<?> wordPOS = (SimpleOrderedMap<?>) wordListPOS.get(n);

            // For the first token only, list every available attribute name and value.
            if (n == 0) {
                System.err.println("<names>");
                for (int m = 0; m < wordPOS.size(); m++) {
                    System.err.println(wordPOS.getName(m) + "=" + wordPOS.getVal(m));
                }
                System.err.println("</names>");
            }

            System.err.println(
                    "text='" + wordPOS.get("text") + "'"
                    + ",type='" + wordPOS.get("type") + "'"
                    + ",partOfSpeech='" + wordPOS.get(ATTR_PART_OF_SPEECH) + "'"
                    + ",reading='" + wordPOS.get(ATTR_READING) + "'");
        }
    }

    /** Returns {@code value}, or fails with a message naming what was missing. */
    private static Object requirePresent(Object value, String what) {
        if (value == null) {
            throw new IllegalStateException("Solr response is missing the " + what);
        }
        return value;
    }

    /** Returns the first value of {@code list}, failing clearly when it is empty or null. */
    private static Object firstValue(NamedList<?> list, String what) {
        if (list.size() == 0) {
            throw new IllegalStateException("Solr response has no entries in the " + what);
        }
        return requirePresent(list.getVal(0), "first entry of the " + what);
    }
}

結果

<names>
text=こんにちは
raw_bytes=[e3 81 93 e3 82 93 e3 81 ab e3 81 a1 e3 81 af]
start=0
end=5
org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#positionLength=1
type=word
org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute#termFrequency=1
org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute#baseForm=null
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech=感動詞
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech (en)=interjection
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading=コンニチハ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading (en)=konnichiha
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation=コンニチワ
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation (en)=konnichiwa
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType (en)=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm (en)=null
position=1
positionHistory=[1]
</names>
text='こんにちは',type='word',partOfSpeech='感動詞',reading='コンニチハ'
text='今日',type='word',partOfSpeech='名詞-副詞可能',reading='キョウ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='いい',type='word',partOfSpeech='形容詞-自立',reading='イイ'
text='天気',type='word',partOfSpeech='名詞-一般',reading='テンキ'
text='です',type='word',partOfSpeech='助動詞',reading='デス'
text='ね',type='word',partOfSpeech='助詞-終助詞',reading='ネ'
text='私',type='word',partOfSpeech='名詞-代名詞-一般',reading='ワタシ'
text='は',type='word',partOfSpeech='助詞-係助詞',reading='ハ'
text='日産',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサン'
text='日産自動車',type='word',partOfSpeech='名詞-固有名詞-組織',reading='ニッサンジドウシャ'
text='自動車',type='word',partOfSpeech='名詞-一般',reading='ジドウシャ'
text='の',type='word',partOfSpeech='助詞-連体化',reading='ノ'
text='社員',type='word',partOfSpeech='名詞-一般',reading='シャイン'
text='です',type='word',partOfSpeech='助動詞',reading='デス'