A record of solving the problems in the second half of Chapter 6. The target file is nlp.txt, as provided on the website.
Perform the following processing on the English text (nlp.txt).
Extract all person names from the input text.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re

# Line-wise patterns for the <word> and <NER> elements in the CoreNLP XML output.
WORD = re.compile(r"<word>(\w+)</word>")
NER = re.compile(r"<NER>(\w+)</NER>")

token = ""
person = ""

f = open('nlp.txt.xml', 'r')
for line in f:
    line = line.strip()
    # Remember the most recent token text.
    word = WORD.search(line)
    if word:
        token = word.group(1)
        continue
    # When the token's NER tag turns out to be PERSON, print the remembered token.
    ner = NER.search(line)
    if ner:
        if ner.group(1) == "PERSON":
            person = token
            print person
f.close()
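As an alternative, the same extraction can be done with a proper XML parser instead of line-wise regexes. This is a minimal sketch, assuming the standard CoreNLP XML layout (document/sentences/sentence/tokens/token elements with <word> and <NER> children):

# Minimal alternative sketch using ElementTree instead of regexes;
# assumes the usual CoreNLP XML layout of nlp.txt.xml.
import xml.etree.ElementTree as et

tree = et.parse('nlp.txt.xml')
for token in tree.iterfind('document/sentences/sentence/tokens/token'):
    if token.findtext('NER') == 'PERSON':
        print token.findtext('word')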
Replace each reference expression (mention) in the text with its representative reference expression (representative mention), based on the result of Stanford CoreNLP's coreference analysis. When substituting, make sure the original mention remains recognizable, e.g. "representative mention (original mention)".
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import re
import xml.etree.ElementTree as et
from functools import partial

# Patterns to undo the PTB-style escaping in the tokenized output.
LRB = re.compile(r"-LRB- ")
RRB = re.compile(r" -RRB-")
NOTATION = re.compile(r" ([,\.:;])")
LDQ = re.compile(r"`` ")
RDQ = re.compile(r" \'\'")
SQ = re.compile(r" \'")
SQS = re.compile(r" \'s")


class StanfordDocument():
    """Thin wrapper around the CoreNLP XML output."""
    def __init__(self, file):
        self.xmltree = et.parse(file)
        root = self.xmltree.getroot()
        self.sentences = root.find('document/sentences')
        self.coreferences = root.find('document/coreference')

    def getListOfSentences(self):
        # Each sentence becomes a list of its token strings.
        sentences = []
        for sentence in self.sentences.findall('sentence'):
            sentences.append([word.text for word in sentence.findall('tokens/token/word')])
        return sentences


def main(file):
    doc = StanfordDocument(file)
    sentences = doc.getListOfSentences()
    for coref in doc.coreferences.findall('coreference'):
        mentions = coref.findall('mention')
        represent = coref.find('mention[@representative="true"]')
        for mention in mentions:
            if mention != represent:
                # CoreNLP indices are 1-based; <end> points one past the last token.
                sentence_i = int(mention.find('sentence').text) - 1
                start_i = int(mention.find('start').text) - 1
                end_i = int(mention.find('end').text) - 2
                target_sentence = sentences[sentence_i]
                # Prepend the representative mention and keep the original in parentheses.
                target_sentence[start_i] = represent.find('text').text.strip() + ' (' + target_sentence[start_i]
                target_sentence[end_i] = target_sentence[end_i] + ')'
    return sentences


def prettifySentence(sentence):
    # Join the tokens and undo the escaping/spacing introduced by the tokenizer.
    s = " ".join(sentence)
    partials = map(
        lambda x: partial(x[0], x[1]),
        [
            (LRB.sub, '('),
            (RRB.sub, ')'),
            (LDQ.sub, '\"'),
            (RDQ.sub, '\"'),
            (SQS.sub, "\'s"),
            (SQ.sub, "\'"),
            (NOTATION.sub, r'\1')
        ]
    )
    for part in partials:
        s = part(s)
    return s


if __name__ == "__main__":
    file = "nlp_line.txt.xml"
    sentences = main(file)
    for sentence in sentences:
        s = prettifySentence(sentence)
        print s
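To illustrate what prettifySentence undoes, here it is applied to a hand-made token list in the escaped form CoreNLP emits (hypothetical input, not taken from nlp.txt):

# Run in the same module (or after importing prettifySentence from the script above).
tokens = ['He', 'said', ',', '``', 'Hello', "''", '.']
print prettifySentence(tokens)  # -> He said, "Hello".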
Visualize the collapsed dependencies from Stanford CoreNLP as a directed graph. For visualization, it is advisable to convert the dependency tree into the DOT language and use Graphviz. To visualize directed graphs directly from Python, use pydot.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import sys

import problem56


def dependToDot(i, dependency):
    """Convert one sentence's dependency list into a DOT digraph string."""
    header = "digraph sentence{0} ".format(i)
    body_head = "{ graph [rankdir = LR]; "
    body = ""
    for dep in dependency:
        # Each <dep> element carries the relation type plus governor/dependent tokens.
        governor, dependent, label = dep.find('governor').text, dep.find('dependent').text, dep.get('type')
        body += '"{gov}"->"{dep}" [label = "{label}"]; '.format(gov=governor, dep=dependent, label=label)
    dotString = header + body_head + body + "}"
    return dotString


def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dotSentences = []
    for i, sentence in enumerate(sentences):
        dependency = sentence.find("dependencies[@type='collapsed-dependencies']")
        dotSentences.append(dependToDot(i + 1, dependency))
    return dotSentences


if __name__ == '__main__':
    dotSentences = main('nlp_line.txt.xml')
    if len(sys.argv) > 1:
        # Print the DOT source for the sentence given as a 1-based argument...
        target = int(sys.argv[1]) - 1
        print dotSentences[target]
    else:
        # ...or for every sentence.
        for dotSentence in dotSentences:
            print dotSentence
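The script above only prints DOT source; the actual rendering is left out. A minimal rendering sketch with pydot, assuming Graphviz is installed and a pydot version whose graph_from_dot_data() returns a list of graphs (older versions return a single graph):

import pydot

# Render the first sentence's dependency graph to a PNG file.
dot_string = dotSentences[0]
graph = pydot.graph_from_dot_data(dot_string)[0]
graph.write_png('sentence1.png')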
Based on the result of the dependency analysis (collapsed dependencies) of Stanford CoreNLP, output the set of "subject predicate object" triples in tab-separated format. Use the following definitions of subject, predicate, and object:
- Predicate: a word that has dependents via both the nsubj and dobj relations
- Subject: the dependent of the predicate via the nsubj relation
- Object: the dependent of the predicate via the dobj relation
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56


def extractTuples(sentence):
    """Collect (predicate, subject, object) triples from one sentence."""
    dependencies = sentence.find("dependencies[@type='collapsed-dependencies']")
    dep_triple = []
    dep_dic = {}
    for dep in dependencies:
        # Key each governor by (index, text) so identical words stay distinct.
        gov = (dep.find('governor').get('idx'), dep.find('governor').text)
        if dep.get('type') in ['nsubj', 'dobj']:
            dep_dic.setdefault(gov, []).append((dep.get('type'), dep.find('dependent').text))
    # A predicate must govern both an nsubj and a dobj dependent.
    verbs = [key for key, value in dep_dic.iteritems() if set([t for (t, d) in value]) == set(['nsubj', 'dobj'])]
    for verb in verbs:
        nsubj = [d for (t, d) in dep_dic[verb] if t == 'nsubj']
        dobj = [d for (t, d) in dep_dic[verb] if t == 'dobj']
        dep_triple += [[verb[1], n, d] for n in nsubj for d in dobj]
    return dep_triple


def main(file):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    dep_triple = []
    for sentence in sentences:
        dep_triple.append(extractTuples(sentence))
    return dep_triple


if __name__ == '__main__':
    dep_triple = main('nlp_line.txt.xml')
    for dep in dep_triple:
        for dt in dep:
            # Output order: subject, predicate, object, tab-separated.
            print "%s\t%s\t%s" % (dt[1], dt[0], dt[2])
Read the result of the phrase structure analysis (S-expressions) produced by Stanford CoreNLP and display all noun phrases (NP) in the text. Display all nested noun phrases as well.
# -*- coding: utf-8 -*-
__author__ = 'todoroki'

import problem56


class TreeParser():
    """Parse a bracketed S-expression parse tree into nested Python lists."""
    def __init__(self):
        self.root = None
        self._stack = [[]]

    def parse(self, tree_string):
        read = []
        for character in tree_string.strip():
            if character == "(":
                # Open a new subtree.
                self._stack.append([])
            elif character == " ":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
            elif character == ")":
                if read:
                    self._stack[-1].append("".join(read))
                    read = []
                # Close the current subtree and attach it to its parent.
                self._stack[-2].append(self._stack.pop())
            else:
                read.append(character)
        self.root = self._stack.pop()

    def get_phrase(self, tag):
        # root[0] is the ROOT node; root[0][1] is the S node below it.
        s = self.root[0][1]
        return self._recursive_finder(s, tag)

    def _recursive_finder(self, lst, tag):
        # Collect every subtree labelled with the given tag, including nested ones.
        res = []
        if lst[0] == tag:
            res.append(lst)
        for l in lst[1:]:
            if isinstance(l, list):
                res.extend(self._recursive_finder(l, tag))
        return res


def main(file, tag):
    doc = problem56.StanfordDocument(file)
    sentences = doc.sentences.findall('sentence')
    tag_phrases = []
    for sentence in sentences:
        parser = TreeParser()
        tree_string = sentence.find('parse').text
        parser.parse(tree_string)
        tag_phrases.append(parser.get_phrase(tag))
    return tag_phrases


def str_phrase(phrase):
    # Flatten a subtree into its leaf tokens.
    res = []
    for p in phrase:
        if isinstance(p, list):
            if isinstance(p[1], list):
                res += str_phrase(p)
            else:
                res.append(p[1])
    return res


if __name__ == "__main__":
    np_phrases = main("nlp_line.txt.xml", "NP")
    for np_phrase in np_phrases:
        for np in np_phrase:
            phrase_list = str_phrase(np)
            np_string = problem56.prettifySentence(phrase_list)
            print np_string
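Finally, a small sanity check of TreeParser on a toy S-expression (the problem59 module name is again an assumption): nested noun phrases should all be reported.

import problem56
from problem59 import TreeParser, str_phrase  # hypothetical module name

parser = TreeParser()
parser.parse("(ROOT (S (NP (NP (DT the) (NN dog)) (PP (IN in) (NP (DT the) (NN park)))) (VP (VBZ barks))))")
for np in parser.get_phrase("NP"):
    # Prints "the dog in the park", "the dog", and "the park".
    print problem56.prettifySentence(str_phrase(np))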