Get detailed results of morphological analysis with Apache Solr 7.6 + SolrJ (Japanese)

Overview

Solr's Japanese text analysis performs morphological analysis by default. The code below shows how to retrieve the detailed morphological analysis results from Java using SolrJ. You can get the same results in the Solr admin console by enabling Verbose Output on the Analysis page.

Source code


package hello.solr;

import java.util.ArrayList;
import java.util.HashMap;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.DocumentAnalysisRequest;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;

/**
 * Sends one document to a Solr core through a {@link DocumentAnalysisRequest}
 * and prints the index-time analysis result of a single field to standard
 * error: first every analysis stage with its raw value, then one summary line
 * per token produced by the Japanese tokenizer.
 */
public class HelloAnalysisJapaneseSimple {

	/** Key under which the tokenizer's token list is looked up in the analysis result. */
	private static final String TOKENIZER_KEY = "org.apache.lucene.analysis.ja.JapaneseTokenizer";

	/** Token attribute key for the part of speech. */
	private static final String NAME_POS = "org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech";

	/** Token attribute key for the reading. */
	private static final String NAME_READING = "org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading";

	/**
	 * Runs the analysis request against {@code http://localhost:8983/solr/core_nlp}
	 * and prints the result.
	 *
	 * @param args not used
	 * @throws Exception if the request fails or the response does not contain
	 *                   the expected analysis entries
	 */
	@SuppressWarnings({ "unchecked", "rawtypes" })
	static public void main(String[] args) throws Exception {

		String fieldName = "field_text_ja";
		String coreName = "core_nlp";
		// NOTE(review): this sample sentence is English although the field name
		// suggests a Japanese analyzer — confirm the intended input text.
		String text = "Hello. It's nice weather today, is not it. I am an employee of Nissan Motor Co., Ltd.";

		HashMap<String, SolrInputField> fields = new HashMap<String, SolrInputField>();

		// Document with an id and the text to analyze
		SolrInputDocument doc = new SolrInputDocument(fields);
		doc.setField("id", "0");
		doc.setField(fieldName, text);

		// Request
		DocumentAnalysisRequest request = new DocumentAnalysisRequest();
		request.addDocument(doc);

		String solrLocation = "http://localhost:8983/solr/" + coreName;

		// The client is only needed for this single request; try-with-resources
		// guarantees it is closed even when the request throws.
		NamedList<Object> response;
		try (SolrClient client = new HttpSolrClient.Builder(solrLocation).build()) {
			response = client.request(request);
		}

		// Walk the response: "analysis" -> first entry -> field -> "index" -> first entry
		NamedList<Object> analysis = (NamedList<Object>) response
				.get("analysis");
		if (analysis == null || analysis.size() == 0) {
			throw new IllegalStateException(
					"Response from " + solrLocation + " contains no 'analysis' entries");
		}

		SimpleOrderedMap f = (SimpleOrderedMap) ((SimpleOrderedMap) analysis
				.getVal(0)).get(fieldName);
		if (f == null) {
			throw new IllegalStateException(
					"No analysis result for field '" + fieldName + "'");
		}

		SimpleOrderedMap index = (SimpleOrderedMap) f.get("index");
		if (index == null || index.size() == 0) {
			throw new IllegalStateException(
					"No 'index' analysis result for field '" + fieldName + "'");
		}

		NamedList nlpResult = (NamedList) index.getVal(0);

		// Dump every analysis stage with its raw value
		System.err.println("Tokenizer,Filter ---");
		for (int n = 0; n < nlpResult.size(); n++) {
			System.err.println(nlpResult.getName(n) + "="
					+ nlpResult.getVal(n));
		}

		ArrayList wordListPOS = (ArrayList) nlpResult.get(TOKENIZER_KEY);

		if (wordListPOS != null) {
			for (int n = 0; n < wordListPOS.size(); n++) {
				SimpleOrderedMap wordPOS = (SimpleOrderedMap) wordListPOS
						.get(n);

				// Print the full attribute list once, for the first token only
				if (n == 0) {
					System.err.println("<names>");
					for (int m = 0; m < wordPOS.size(); m++) {
						System.err.println(wordPOS.getName(m) + "="
								+ wordPOS.getVal(m));
					}
					System.err.println("</names>");
				}

				System.err.println( //
						"text='" + wordPOS.get("text") + "'" //
						+ ",type='" + wordPOS.get("type") + "'" //
						+ ",partOfSpeech='" + wordPOS.get(NAME_POS) + "'" //
						+ ",reading='" + wordPOS.get(NAME_READING) + "'" //
						);
			}
		}

	}
}


Result


<names>
text=Hello
raw_bytes=[e3 81 93 e3 82 93 e3 81 ab e3 81 a1 e3 81 af]
start=0
end=5
org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute#positionLength=1
type=word
org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute#termFrequency=1
org.apache.lucene.analysis.ja.tokenattributes.BaseFormAttribute#baseForm=null
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech=Interjection
org.apache.lucene.analysis.ja.tokenattributes.PartOfSpeechAttribute#partOfSpeech (en)=interjection
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading=Hello
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#reading (en)=konnichiha
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation=Hello
org.apache.lucene.analysis.ja.tokenattributes.ReadingAttribute#pronunciation (en)=konnichiwa
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionType (en)=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm=null
org.apache.lucene.analysis.ja.tokenattributes.InflectionAttribute#inflectionForm (en)=null
position=1
positionHistory=[1]
</names>
text='Hello',type='word',partOfSpeech='Interjection',reading='Hello'
text='today',type='word',partOfSpeech='noun-Adverbs possible',reading='today'
text='Is',type='word',partOfSpeech='Particle-係Particle',reading='C'
text='Good',type='word',partOfSpeech='adjective-Independence',reading='good'
text='weather',type='word',partOfSpeech='noun-General',reading='weather'
text='is',type='word',partOfSpeech='Auxiliary verb',reading='death'
text='Ne',type='word',partOfSpeech='Particle-終Particle',reading='Ne'
text='I',type='word',partOfSpeech='noun-代noun-General',reading='I'
text='Is',type='word',partOfSpeech='Particle-係Particle',reading='C'
text='Nissan',type='word',partOfSpeech='noun-固有noun-Organization',reading='Nissan'
text='Nissan Motor',type='word',partOfSpeech='noun-固有noun-Organization',reading='Nissan Jidosha'
text='Automobile',type='word',partOfSpeech='noun-General',reading='Jidosha'
text='of',type='word',partOfSpeech='Particle-Attributive',reading='No'
text='Employee',type='word',partOfSpeech='noun-General',reading='Shine'
text='is',type='word',partOfSpeech='Auxiliary verb',reading='death'


Recommended Posts

Get detailed results of morphological analysis with Apache Solr 7.6 + SolrJ (Japanese)
Get detailed results of morphological analysis with Apache Solr 7.6 + SolrJ
NLP4J [006-030] 100 language processing knocks with NLP4J # 30 Reading morphological analysis results
Start Apache Solr with Embedded.
Use Japanese morphological analysis "kuromoji"
Morphological analysis in Java with Kuromoji
I tried morphological analysis with MeCab
Get validation results with Spring Boot
How to realize hybrid search using morphological analysis and Ngram with Solr