Using a REST API such as the COTOHA API involves three steps: (1) create a request, (2) send it, and (3) parse the response. The previous article (Try using COTOHA API parsing in Java) covered steps 1 (create) and 2 (send), but left the response as raw JSON, so this article maps that JSON onto Java classes.
NLP4J, which I am developing myself, already has a class that models dependency structure, DefaultKeywordWithDependency, so I will map the response onto it. (Because a dependency structure is a tree, this is not a simple mapping onto a flat POJO class.)
DefaultKeywordWithDependency https://github.com/oyahiroki/nlp4j/blob/master/nlp4j/nlp4j-core/src/main/java/nlp4j/impl/DefaultKeywordWithDependency.java
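To illustrate why a tree-capable class matters, here is a minimal sketch of linking two keywords as parent and child. It uses only methods that the parser below actually calls (addChild, setRelation, setLex, toStringAsDependencyTree); the values are made up for illustration:

// Minimal sketch: building a two-node dependency tree by hand.
// Method names are those used by the parser below; the values are illustrative.
DefaultKeywordWithDependency parent = new DefaultKeywordWithDependency();
parent.setLex("go");       // head word
DefaultKeywordWithDependency child = new DefaultKeywordWithDependency();
child.setLex("school");    // modifier
child.setRelation("nmod"); // dependency label
parent.addChild(child);    // the parent-child link is what makes this a tree
// printing a root renders the whole tree with children indented
System.err.println(parent.toStringAsDependencyTree());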
Parser
The parsing result JSON looks like the following. The result is tree-structured data, but as you can see, the JSON itself is not laid out as a tree.
{
  "result": [
    {
      "chunk_info": {
        "id": 0, "head": 2, "dep": "D", "chunk_head": 0, "chunk_func": 1,
        "links": []
      },
      "tokens": [
        {
          "id": 0, "form": "today", "kana": "today", "lemma": "today", "pos": "noun",
          "features": ["Date and time"],
          "dependency_labels": [{"token_id": 1, "label": "case"}],
          "attributes": {}
        },
        {
          "id": 1, "form": "Is", "kana": "C", "lemma": "Is", "pos": "Conjunctive particles",
          "features": [],
          "attributes": {}
        }
      ]
    },
    {...(omitted)...},
    {...(omitted)...}
  ],
  "status": 0,
  "message": ""
}
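Before looking at the full parser, here is the minimal Gson access pattern it is built on. This is a sketch: it assumes only the result, status, and message fields shown above and uses the same com.google.gson classes (Gson, JsonObject, JsonArray) that the parser imports.

// Minimal sketch of reading the top-level response with Gson.
Gson gson = new Gson();
JsonObject response = gson.fromJson(json, JsonObject.class);
// "status" is 0 on success (see the JSON above); checking it first is prudent
if (response.get("status").getAsInt() != 0) {
    throw new IllegalStateException("COTOHA error: " + response.get("message").getAsString());
}
// "result" is a flat array of {chunk_info, tokens} objects, not a tree
JsonArray arrChunkTokens = response.getAsJsonArray("result");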
Below is the parser class. (All of the code will be published on the Maven Repository and GitHub.)
package nlp4j.cotoha;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import nlp4j.Keyword;
import nlp4j.impl.DefaultKeyword;
import nlp4j.impl.DefaultKeywordWithDependency;
/**
 * Parses COTOHA API Parsing (v1) response JSON.
 *
 * @author Hiroki Oya
 * @since 1.0.0.0
 */
public class CotohaNlpV1ResponseHandler {
static private final Logger logger = LogManager.getLogger(MethodHandles.lookup().lookupClass());
/**
 * Keywords extracted as roots of the dependency trees
 */
ArrayList<DefaultKeywordWithDependency> roots = new ArrayList<>();
/**
 * List of keywords
 */
ArrayList<Keyword> keywords = new ArrayList<>();
/**
 * Keywords built from chunk dependency links
 */
ArrayList<Keyword> chunkLinkKeywords = new ArrayList<>();
/**
 * @return keywords built from chunk dependency links
 */
public ArrayList<Keyword> getChunkLinkKeywords() {
return chunkLinkKeywords;
}
/**
 * Dependency source links (as strings)
 */
ArrayList<String> chunkLinks = new ArrayList<>();
/**
 * Dependency source links (as JSON)
 */
JsonArray arrChunkLinks = new JsonArray();
/**
* Map: token_id --> Keyword
*/
HashMap<String, DefaultKeywordWithDependency> mapTokenidKwd = new HashMap<>();
/**
* Map: id --> Keyword
*/
HashMap<String, DefaultKeywordWithDependency> mapIdKwd = new HashMap<>();
/**
* token id --> sentence
*/
HashMap<Integer, Integer> idSentenceMap = new HashMap<>();
/**
 * Dependency pattern keywords
 */
ArrayList<DefaultKeyword> patternKeywords = new ArrayList<>();
/**
 * @return dependency source links (as JSON)
 */
public JsonArray getArrChunkLinks() {
return arrChunkLinks;
}
/**
 * @return dependency source links (as strings)
 */
public ArrayList<String> getChunkLinks() {
return chunkLinks;
}
/**
 * @return map from morpheme id to keyword
 */
public HashMap<String, DefaultKeywordWithDependency> getIdMapKwd() {
return mapIdKwd;
}
/**
 * @return map from morpheme id to sentence number
 */
public HashMap<Integer, Integer> getIdSentenceMap() {
return idSentenceMap;
}
/**
 * @return keywords in order of appearance
 */
public ArrayList<Keyword> getKeywords() {
return keywords;
}
/**
 * @return map from token_id (X-Y form) to keyword
 */
public HashMap<String, DefaultKeywordWithDependency> getMapKwd() {
return mapTokenidKwd;
}
/**
 * @return dependency pattern keywords
 */
public ArrayList<DefaultKeyword> getPatternKeywords() {
return patternKeywords;
}
/**
 * @return keywords extracted as dependency tree roots
 */
public ArrayList<DefaultKeywordWithDependency> getRoots() {
return roots;
}
/**
* @param json COTOHA API Parsing Response JSON
*/
public void parse(String json) {
// JSON Parser
Gson gson = new Gson();
// COTOHA API RESPONSE
JsonObject result = gson.fromJson(json, JsonObject.class);
// order of appearance in the text
int sequence = 0;
// {
//   "result": [
//     {"chunk_info":{...},"tokens":[{...},{...},{...}]},
//     {"chunk_info":{...},"tokens":[{...},{...},{...}]},
//     {"chunk_info":{...},"tokens":[{...},{...},{...}]}
//   ]
// }
// each element of "result" combines chunk_info and tokens
JsonArray arrChunkTokens = result.getAsJsonArray("result");
int idxBegin = 0;
int idxSentence = 0;
// FOR EACH(chunk_tokens)
for (int idxChunkTokens = 0; idxChunkTokens < arrChunkTokens.size(); idxChunkTokens++) {
JsonObject chunk_token = arrChunkTokens.get(idxChunkTokens).getAsJsonObject();
// 1. chunk_info: clause (chunk) information object
// https://api.ce-cotoha.com/contents/reference/apireference.html#parsing_response_chunk
JsonObject chunk_info = chunk_token.get("chunk_info").getAsJsonObject();
logger.debug("chunk_info: " + chunk_info);
int chunk_head = -1;
{
// chunk (clause) number (0-origin)
String chunk_id = "" + chunk_info.get("id").getAsInt();
// number of the chunk this one depends on (head)
chunk_head = chunk_info.get("head").getAsInt();
// array of dependency source (modifier) information
// https://api.ce-cotoha.com/contents/reference/apireference.html#parsing_response_links
JsonArray links = chunk_info.get("links").getAsJsonArray();
for (int n = 0; n < links.size(); n++) {
JsonObject link = links.get(n).getAsJsonObject();
int link_link = link.get("link").getAsInt();
String link_label = link.get("label").getAsString();
chunkLinks.add(chunk_id + "/" + link_label + "/" + link_link);
arrChunkLinks.add(link);
}
}
// 2. tokens: morpheme information objects
// https://api.ce-cotoha.com/contents/reference/apireference.html#parsing_response_morpheme
JsonArray tokens = chunk_token.get("tokens").getAsJsonArray();
// FOR EACH token (morpheme information object)
for (int idxTokens = 0; idxTokens < tokens.size(); idxTokens++) {
JsonObject token = tokens.get(idxTokens).getAsJsonObject();
logger.debug("token: " + token);
// ID in X-Y form: the Y-th morpheme of the X-th chunk
String token_id = idxChunkTokens + "-" + idxTokens;
logger.debug("token_id: " + token_id);
String token_pos = token.get("pos") != null ? token.get("pos").getAsString() : null;
String token_lemma = token.get("lemma") != null ? token.get("lemma").getAsString() : null;
String token_form = token.get("form") != null ? token.get("form").getAsString() : null;
String token_kana = token.get("kana") != null ? token.get("kana").getAsString() : null;
// is this the last token of the chunk? if so, its dependency points to the head chunk (chunk_head)
boolean isLastOfTokens = (idxTokens == tokens.size() - 1);
if (isLastOfTokens) {
logger.debug("Last token: chunk_head:" + chunk_head);
}
// dependency keyword (defined in nlp4j)
DefaultKeywordWithDependency kw = new DefaultKeywordWithDependency();
// sequence number in order of appearance in the text
kw.setSequence(sequence);
sequence++;
// start position
kw.setBegin(idxBegin);
// lemma: base (dictionary) form
if (token_lemma != null) {
kw.setLex(token_lemma);
} else {
logger.warn("lemma is null");
}
int intId = token.get("id").getAsInt();
String id = "" + token.get("id").getAsInt();
idSentenceMap.put(intId, idxSentence);
//Whether it is the end of a sentence
boolean isLastOfSentence = (chunk_head == -1 && idxTokens == tokens.size() - 1) //
|| (token_pos != null && token_pos.equals("Kuten"));
// IF(End of sentence)
if (isLastOfSentence) {
// increment the sentence number
idxSentence++;
}
// facet: part of speech
kw.setFacet(token_pos);
// str: surface form
kw.setStr(token_form);
kw.setEnd(idxBegin + kw.getStr().length());
idxBegin += kw.getStr().length();
// reading: kana reading
kw.setReading(token_kana);
mapTokenidKwd.put(token_id, kw);
mapIdKwd.put(id, kw);
keywords.add(kw);
// dependency_labels: array of dependency information
if (token.get("dependency_labels") != null) {
// array of dependency information
JsonArray arrDependency = token.get("dependency_labels").getAsJsonArray();
for (int n = 0; n < arrDependency.size(); n++) {
//Dependency information
JsonObject objDependency = arrDependency.get(n).getAsJsonObject();
String dependency_token_id = "" + objDependency.get("token_id").getAsInt();
//Set dependency information for keywords
kw.setDependencyKey(dependency_token_id);
}
}
} // END OF FOR EACH TOKENS
} // END OF FOR EACH (chunk_tokens)
// <Assembling the tree>
// FOR EACH(chunk_tokens)
for (int idxChunkTokens = 0; idxChunkTokens < arrChunkTokens.size(); idxChunkTokens++) {
JsonObject chunk_token = arrChunkTokens.get(idxChunkTokens).getAsJsonObject();
// 2. tokens
JsonArray tokens = chunk_token.get("tokens").getAsJsonArray();
// FOR (EACH TOKEN)
for (int idxTokens = 0; idxTokens < tokens.size(); idxTokens++) {
JsonObject token = tokens.get(idxTokens).getAsJsonObject();
String id = "" + token.get("id").getAsInt();
DefaultKeywordWithDependency kw = mapIdKwd.get(id);
// dependency labels
if (token.get("dependency_labels") != null) {
JsonArray arr_dependency_labels = token.get("dependency_labels").getAsJsonArray();
for (int n = 0; n < arr_dependency_labels.size(); n++) {
JsonObject dependency_label = arr_dependency_labels.get(n).getAsJsonObject();
String childID = "" + dependency_label.get("token_id").getAsInt();
String labelDependency = dependency_label.get("label").getAsString();
// check whether the dependency straddles a sentence boundary
int sentence1 = idSentenceMap.get(token.get("id").getAsInt());
int sentence2 = idSentenceMap.get(dependency_label.get("token_id").getAsInt());
// do not link across sentences
if (mapIdKwd.get(childID) != null && (sentence1 == sentence2)) {
// parent and child are reversed between Japanese and English
DefaultKeywordWithDependency kw1Child = mapIdKwd.get(childID);
DefaultKeywordWithDependency kw2Parent = kw;
kw2Parent.addChild(kw1Child);
kw1Child.setRelation(labelDependency);
if (kw1Child.getBegin() < kw2Parent.getBegin()) {
DefaultKeyword kwd = new DefaultKeyword();
kwd.setBegin(kw1Child.getBegin());
kwd.setEnd(kw2Parent.getEnd());
kwd.setLex(kw1Child.getLex() + " ... " + kw2Parent.getLex());
kwd.setFacet(labelDependency);
patternKeywords.add(kwd);
} else {
DefaultKeyword kwd = new DefaultKeyword();
kwd.setBegin(kw2Parent.getBegin());
kwd.setEnd(kw1Child.getEnd());
kwd.setLex(kw2Parent.getLex() + " ... " + kw1Child.getLex());
kwd.setFacet(labelDependency);
patternKeywords.add(kwd);
}
} //
}
}
} // END OF FOR EACH TOKEN
} // END OF FOR EACH (chunk_tokens)
for (String link : chunkLinks) {
String id1 = link.split("/")[0];
String relation = link.split("/")[1];
String id2 = link.split("/")[2];
Keyword kwd1 = mapTokenidKwd.get(id1 + "-0");
Keyword kwd2 = mapTokenidKwd.get(id2 + "-0");
String lex1 = kwd1.getLex();
String lex2 = kwd2.getLex();
DefaultKeyword kwd = new DefaultKeyword();
kwd.setBegin(kwd1.getBegin());
kwd.setEnd(kwd2.getEnd());
kwd.setLex(lex2 + " ... " + lex1);
kwd.setStr(kwd.getLex());
kwd.setFacet(relation);
chunkLinkKeywords.add(kwd);
}
// </Assembling the tree>
for (String key : mapIdKwd.keySet()) {
DefaultKeywordWithDependency kw = mapIdKwd.get(key);
// IF(If it is a root keyword)
if (kw.getParent() == null) {
roots.add(kw);
}
}
} // end of parse()
}
Regarding COTOHA API parsing: when two sentences such as "It's a nice day today. I'm going to school tomorrow." are analyzed at once, the API appears to return dependencies that span the two sentences. (Please point it out if this understanding is wrong.)
On the analysis demo page the two sentences are displayed separately, so the demo seems to split the text on sentence-final punctuation before running the parse.
Therefore, this parser counts the sentence number of each morpheme in advance (idSentenceMap in the code above) and ignores any dependency that spans sentences.
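An alternative, which seems to match what the demo page does, is to split the input on sentence-final punctuation before calling the API. A minimal sketch (the splitting rule is my own assumption, not part of the API; the example text is the two sentences above in the original Japanese):

// Sketch: pre-split the input on Japanese full stops (。) and parse
// each sentence separately. The lookbehind keeps the 。 on each sentence.
String text = "今日はいい天気です。明日は学校に行きます。";
for (String sentence : text.split("(?<=。)")) {
    if (!sentence.trim().isEmpty()) {
        // send `sentence` to the parsing API here (see the previous article)
    }
}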
As a test case, the following parses a JSON file containing a saved response from the COTOHA parsing API and prints the result as text. (The code is scheduled to be published on GitHub at a later date.)
import java.io.File;
import org.apache.commons.io.FileUtils;
import nlp4j.Keyword;
import nlp4j.impl.DefaultKeywordWithDependency;

// read a saved COTOHA parsing API response
File file = new File("src/test/resources/nlp_v1_parse_002.json");
String json = FileUtils.readFileToString(file, "UTF-8");
// map the JSON onto keywords and dependency trees
CotohaNlpV1ResponseHandler handler = new CotohaNlpV1ResponseHandler();
handler.parse(json);
// print each dependency tree (one root per sentence)
for (DefaultKeywordWithDependency root : handler.getRoots()) {
System.err.println(root.toStringAsDependencyTree());
}
System.err.println("---");
// print every keyword (morpheme)
for (Keyword kwd : handler.getKeywords()) {
System.err.println(kwd.getLex() + " (" + "word." + kwd.getFacet() + ")");
System.err.println("\t" + kwd);
}
System.err.println("---");
// print token-level dependency pattern keywords
for (Keyword kwd : handler.getPatternKeywords()) {
System.err.println(kwd.getLex() + " (" + "pattern." + kwd.getFacet() + ")");
System.err.println("\t" + kwd);
}
System.err.println("---");
// print chunk-level dependency keywords
for (Keyword kwd : handler.getChunkLinkKeywords()) {
System.err.println(kwd.getLex() + " (" + "pattern." + kwd.getFacet() + ")");
System.err.println("\t" + kwd);
}
The output looks like the following. Raw JSON is hard to read, but printed as a tree, the dependency structure becomes much easier to understand.
-sequence=11,lex=go,str=line,relation=null
 -sequence=7,lex=tomorrow,str=tomorrow,relation=nmod
  -sequence=8,lex=Is,str=Is,relation=case
 -sequence=9,lex=school,str=school,relation=nmod
  -sequence=10,lex=To,str=To,relation=case
 -sequence=12,lex=Ki,str=Ki,relation=aux
 -sequence=13,lex=Masu,str=Masu,relation=aux
 -sequence=14,lex=。,str=。,relation=punct
-sequence=4,lex=weather,str=weather,relation=null
 -sequence=0,lex=today,str=today,relation=nmod
  -sequence=1,lex=Is,str=Is,relation=case
 -sequence=2,lex=Good,str=I,relation=amod
  -sequence=3,lex=I,str=I,relation=aux
 -sequence=5,lex=is,str=is,relation=cop
 -sequence=6,lex=。,str=。,relation=punct
---
today(word.noun)
today[relation=nmod, sequence=0, dependencyKey=1, hasChildren=true, hasParent=false, facet=noun, lex=today, str=today, reading=today, begin=0, end=2]
Is(word.Conjunctive particles)
Is[relation=case, sequence=1, dependencyKey=null, hasChildren=false, hasParent=false, facet=Conjunctive particles, lex=Is, str=Is, reading=C, begin=2, end=3]
Good(word.Adjective stem)
Good[relation=amod, sequence=2, dependencyKey=3, hasChildren=true, hasParent=false, facet=Adjective stem, lex=Good, str=I, reading=I, begin=3, end=4]
I(word.Adjective suffix)
I[relation=aux, sequence=3, dependencyKey=null, hasChildren=false, hasParent=false, facet=Adjective suffix, lex=I, str=I, reading=I, begin=4, end=5]
weather(word.noun)
weather[relation=null, sequence=4, dependencyKey=6, hasChildren=true, hasParent=true, facet=noun, lex=weather, str=weather, reading=weather, begin=5, end=7]
is(word.Judgment)
is[relation=cop, sequence=5, dependencyKey=null, hasChildren=false, hasParent=false, facet=Judgment, lex=is, str=is, reading=death, begin=7, end=9]
。 (word.Kuten)
。 [relation=punct, sequence=6, dependencyKey=null, hasChildren=false, hasParent=false, facet=Kuten, lex=。, str=。, reading=, begin=9, end=10]
tomorrow(word.noun)
tomorrow[relation=nmod, sequence=7, dependencyKey=8, hasChildren=true, hasParent=false, facet=noun, lex=tomorrow, str=tomorrow, reading=Ass, begin=10, end=12]
Is(word.Conjunctive particles)
Is[relation=case, sequence=8, dependencyKey=null, hasChildren=false, hasParent=false, facet=Conjunctive particles, lex=Is, str=Is, reading=C, begin=12, end=13]
school(word.noun)
school[relation=nmod, sequence=9, dependencyKey=10, hasChildren=true, hasParent=false, facet=noun, lex=school, str=school, reading=Gakkou, begin=13, end=15]
To(word.Case particles)
To[relation=case, sequence=10, dependencyKey=null, hasChildren=false, hasParent=false, facet=Case particles, lex=To, str=To, reading=D, begin=15, end=16]
go(word.Verb stem)
go[relation=null, sequence=11, dependencyKey=14, hasChildren=true, hasParent=true, facet=Verb stem, lex=go, str=line, reading=I, begin=16, end=17]
Ki(word.Verb conjugation ending)
Ki[relation=aux, sequence=12, dependencyKey=null, hasChildren=false, hasParent=false, facet=Verb conjugation ending, lex=Ki, str=Ki, reading=Ki, begin=17, end=18]
Masu(word.Verb suffix)
Masu[relation=aux, sequence=13, dependencyKey=null, hasChildren=false, hasParent=false, facet=Verb suffix, lex=Masu, str=Masu, reading=trout, begin=18, end=20]
。 (word.Kuten)
。 [relation=punct, sequence=14, dependencyKey=null, hasChildren=false, hasParent=false, facet=Kuten, lex=。, str=。, reading=, begin=20, end=21]
---
today...Is(pattern.case)
today...Is[sequence=-1, facet=case, lex=today...Is, str=null, reading=null, count=-1, begin=0, end=3, correlation=0.0]
Good...I(pattern.aux)
Good...I[sequence=-1, facet=aux, lex=Good...I, str=null, reading=null, count=-1, begin=3, end=5, correlation=0.0]
today...weather(pattern.nmod)
today...weather[sequence=-1, facet=nmod, lex=today...weather, str=null, reading=null, count=-1, begin=0, end=7, correlation=0.0]
Good...weather(pattern.amod)
Good...weather[sequence=-1, facet=amod, lex=Good...weather, str=null, reading=null, count=-1, begin=3, end=7, correlation=0.0]
weather...is(pattern.cop)
weather...is[sequence=-1, facet=cop, lex=weather...is, str=null, reading=null, count=-1, begin=5, end=9, correlation=0.0]
weather... 。 (pattern.punct)
weather... 。 [sequence=-1, facet=punct, lex=weather... 。, str=null, reading=null, count=-1, begin=5, end=10, correlation=0.0]
tomorrow...Is(pattern.case)
tomorrow...Is[sequence=-1, facet=case, lex=tomorrow...Is, str=null, reading=null, count=-1, begin=10, end=13, correlation=0.0]
school...To(pattern.case)
school...To[sequence=-1, facet=case, lex=school...To, str=null, reading=null, count=-1, begin=13, end=16, correlation=0.0]
tomorrow...go(pattern.nmod)
tomorrow...go[sequence=-1, facet=nmod, lex=tomorrow...go, str=null, reading=null, count=-1, begin=10, end=17, correlation=0.0]
school...go(pattern.nmod)
school...go[sequence=-1, facet=nmod, lex=school...go, str=null, reading=null, count=-1, begin=13, end=17, correlation=0.0]
go...Ki(pattern.aux)
go...Ki[sequence=-1, facet=aux, lex=go...Ki, str=null, reading=null, count=-1, begin=16, end=18, correlation=0.0]
go...Masu(pattern.aux)
go...Masu[sequence=-1, facet=aux, lex=go...Masu, str=null, reading=null, count=-1, begin=16, end=20, correlation=0.0]
go... 。 (pattern.punct)
go... 。 [sequence=-1, facet=punct, lex=go... 。, str=null, reading=null, count=-1, begin=16, end=21, correlation=0.0]
---
today...weather(pattern.time)
today...weather[sequence=-1, facet=time, lex=today...weather, str=today...weather, reading=null, count=-1, begin=5, end=2, correlation=0.0]
Good...weather(pattern.adjectivals)
Good...weather[sequence=-1, facet=adjectivals, lex=Good...weather, str=Good...weather, reading=null, count=-1, begin=5, end=4, correlation=0.0]
weather...go(pattern.manner)
weather...go[sequence=-1, facet=manner, lex=weather...go, str=weather...go, reading=null, count=-1, begin=16, end=7, correlation=0.0]
tomorrow...go(pattern.time)
tomorrow...go[sequence=-1, facet=time, lex=tomorrow...go, str=tomorrow...go, reading=null, count=-1, begin=16, end=12, correlation=0.0]
school...go(pattern.goal)
school...go[sequence=-1, facet=goal, lex=school...go, str=school...go, reading=null, count=-1, begin=16, end=15, correlation=0.0]
Results that are easy to handle as Java classes are also easy to put to business use. Parsing a parser's output is a bit of a hassle, but in real-world work, **this is where the game is decided**.