Before we talk about WordNet, let's take a quick look at natural language processing. In order for machines to understand sentences in natural language processing, the following step-by-step tasks of morphological analysis, parsing, semantic analysis, and context analysis are mainly required.
Morphological analysis is the task of dividing natural-language text data that carries no grammatical annotations into a sequence of morphemes (the smallest units that have meaning in the language) and identifying the part of speech of each one, based on information such as the grammar of the target language and a dictionary of word parts of speech.
For example, the morphological analysis of "Waiting for you" is as follows.
Reference: wikipedia
MeCab is a typical morphological analysis tool.
Parsing (syntactic analysis) is the procedure of dividing a natural-language sentence into morphemes and clarifying the relationships between them, for example by diagramming.
For example, when there is a sentence "Beautiful watermill maiden", the following two syntaxes can be considered.
Reference: wikipedia
We will analyze what kind of connection there is based on the information of the surrounding morphemes. Cabocha is a typical analysis tool.
Semantic analysis is a procedure in which knowledge is given to the machine. For example, consider the following sentence:
"The high Mt. Fuji and the sea are beautiful"
For a human, it is easy to see which word "high" modifies. Even though "high Mt. Fuji" makes sense, we can intuitively judge that there is no such thing as a "high sea". However, since the machine has no knowledge, it cannot judge that "high sea" is strange.
WordNet is a type of thesaurus: one that classifies and organizes words according to their superordinate / subordinate relationships, part / whole relationships, synonyms, and near-synonyms. It systematizes the relationships between words, and is used in the above-mentioned semantic analysis.
For example, in the case of "on-road bike", the superordinate concept including "on-road bike" is "motorcycle" and "vehicle", and the subordinate concept is "naked" and "American". In addition, "scooter" and "off-road motorcycle" written in parallel with "motorcycle" can be regarded as similar words.
This data is already organized for programmatic use and can be downloaded below. It is published as an SQLite database and can be read using SQL.
--table and column list
pos_def ('pos', 'lang', 'def')
link_def ('link', 'lang', 'def')
synset_def ('synset', 'lang', 'def', 'sid')
synset_ex ('synset', 'lang', 'def', 'sid')
synset ('synset', 'pos', 'name', 'src')
synlink ('synset1', 'synset2', 'link', 'src')
ancestor ('synset1', 'synset2', 'hops')
sense ('synset' ,'wordid','lang', 'rank', 'lexid','freq','src')
word ('wordid','lang', 'lemma', 'pron', 'pos')
variant ('varid','wordid','lang', 'lemma','vartype')
xlink ('synset', 'resource','xref', 'misc', 'confidence')
import sqlite3
# Module-level connection to the WordNet SQLite file "wnjpn.db"; the relative
# path is resolved against the current working directory.
conn = sqlite3.connect("wnjpn.db")
def chk_table(connection=None):
    """Print the size of the ``word`` table and the schema of every table.

    The output starts with an empty line, a header and the row count of
    ``word``.  Then, for each table listed in ``sqlite_master``, a separator
    line, the table name and one ``PRAGMA TABLE_INFO`` row per column are
    printed.

    Args:
        connection: Open ``sqlite3`` connection to inspect.  When omitted,
            the module-level ``conn`` is used.
    """
    db = conn if connection is None else connection

    print("")
    print("###word table info")
    for count_row in db.execute("select count(*) from word"):
        print("word num:" + str(count_row[0]))

    # Read all table names first so the per-table schema queries below use
    # their own names instead of rebinding the outer loop's cursor and row.
    table_rows = db.execute(
        "select name from sqlite_master where type='table'"
    ).fetchall()
    for (table_name,) in table_rows:
        print("=======================================")
        print(table_name)
        # PRAGMA arguments cannot be bound with "?", so the name is emitted
        # as a quoted identifier; this keeps names containing spaces or
        # other special characters from breaking the statement.
        quoted_name = '"' + table_name.replace('"', '""') + '"'
        for column_info in db.execute("PRAGMA TABLE_INFO(" + quoted_name + ")"):
            print(column_info)
# Run the schema dump only when this file is executed as a script.
if __name__ == "__main__":
    chk_table()
--Result 1
###word table info
word num:249121
=======================================
pos_def
(0, 'pos', 'text', 0, None, 0)
(1, 'lang', 'text', 0, None, 0)
(2, 'def', 'text', 0, None, 0)
=======================================
link_def
(0, 'link', 'text', 0, None, 0)
(1, 'lang', 'text', 0, None, 0)
(2, 'def', 'text', 0, None, 0)
=======================================
synset_def
(0, 'synset', 'text', 0, None, 0)
(1, 'lang', 'text', 0, None, 0)
(2, 'def', 'text', 0, None, 0)
(3, 'sid', 'text', 0, None, 0)
=======================================
synset_ex
(0, 'synset', 'text', 0, None, 0)
(1, 'lang', 'text', 0, None, 0)
(2, 'def', 'text', 0, None, 0)
(3, 'sid', 'text', 0, None, 0)
=======================================
synset
(0, 'synset', 'text', 0, None, 0)
(1, 'pos', 'text', 0, None, 0)
(2, 'name', 'text', 0, None, 0)
(3, 'src', 'text', 0, None, 0)
=======================================
synlink
(0, 'synset1', 'text', 0, None, 0)
(1, 'synset2', 'text', 0, None, 0)
(2, 'link', 'text', 0, None, 0)
(3, 'src', 'text', 0, None, 0)
=======================================
ancestor
(0, 'synset1', 'text', 0, None, 0)
(1, 'synset2', 'text', 0, None, 0)
(2, 'hops', 'int', 0, None, 0)
=======================================
sense
(0, 'synset', 'text', 0, None, 0)
(1, 'wordid', 'integer', 0, None, 0)
(2, 'lang', 'text', 0, None, 0)
(3, 'rank', 'text', 0, None, 0)
(4, 'lexid', 'integer', 0, None, 0)
(5, 'freq', 'integer', 0, None, 0)
(6, 'src', 'text', 0, None, 0)
=======================================
word
(0, 'wordid', 'integer', 0, None, 1)
(1, 'lang', 'text', 0, None, 0)
(2, 'lemma', 'text', 0, None, 0)
(3, 'pron', 'text', 0, None, 0)
(4, 'pos', 'text', 0, None, 0)
=======================================
variant
(0, 'varid', 'integer', 0, None, 1)
(1, 'wordid', 'integer', 0, None, 0)
(2, 'lang', 'text', 0, None, 0)
(3, 'lemma', 'text', 0, None, 0)
(4, 'vartype', 'text', 0, None, 0)
=======================================
xlink
(0, 'synset', 'text', 0, None, 0)
(1, 'resource', 'text', 0, None, 0)
(2, 'xref', 'text', 0, None, 0)
(3, 'misc', 'text', 0, None, 0)
(4, 'confidence', 'text', 0, None, 0)
import sqlite3
# Module-level connection to the WordNet SQLite file "wnjpn.db"; the relative
# path is resolved against the current working directory.
conn = sqlite3.connect("wnjpn.db")
def chk_word(lang="jpn", limit=240000, connection=None):
    """Print rows of the ``word`` table, one tuple per line.

    Args:
        lang: Value of the ``lang`` column to select.  Pass ``None`` to list
            words of every language.
        limit: Maximum number of rows to print.
        connection: Open ``sqlite3`` connection to query.  When omitted, the
            module-level ``conn`` is used.
    """
    db = conn if connection is None else connection

    if lang is None:
        rows = db.execute("select * from word limit ?", (limit,))
    else:
        rows = db.execute(
            "select * from word where lang=? limit ?", (lang, limit)
        )
    for row in rows:
        print(row)
# Run the word listing only when this file is executed as a script.
if __name__ == "__main__":
    chk_word()
--Result 2 Show only part
(249100, 'jpn', 'Soup plate', None, 'n')
(249101, 'jpn', 'Stretch', None, 'v')
(249102, 'jpn', 'Astringent', None, 'n')
(249103, 'jpn', 'Affirmation', None, 'n')
(249104, 'jpn', 'Allborg', None, 'n')
(249105, 'jpn', 'Behind side', None, 'n')
(249106, 'jpn', 'Repair', None, 'n')
(249107, 'jpn', 'Convenience', None, 'n')
(249108, 'jpn', 'Convenience', None, 'a')
(249109, 'jpn', 'Vilas', None, 'n')
(249110, 'jpn', 'Old-fashioned', None, 'a')
(249111, 'jpn', 'Cut-off', None, 'n')
(249112, 'jpn', 'Cut-off', None, 'a')
(249113, 'jpn', 'Super text', None, 'n')
(249114, 'jpn', 'Sexually transmitted diseases', None, 'n')
(249115, 'jpn', 'Mayu ink', None, 'n')
(249116, 'jpn', 'hemline', None, 'n')
(249117, 'jpn', 'Non-inbred', None, 'a')
(249118, 'jpn', 'Scientific instruments', None, 'n')
(249119, 'jpn', 'Backtrack', None, 'v')
(249120, 'jpn', 'Repeat', None, 'v')
(249121, 'jpn', 'will', None, 'n')
import sqlite3
# Module-level connection to the WordNet SQLite file "wnjpn.db"; the relative
# path is resolved against the current working directory.
conn = sqlite3.connect("wnjpn.db")
def main(word, connection=None):
    """Print the concepts that *word* belongs to and the other words in each.

    The lemma is looked up in the ``word`` table.  If it is missing, a
    message is printed and the function returns.  Otherwise the list of
    synset ids linked to the word through ``sense`` is printed, and then for
    every synset: its name, its definitions with ``lang='jpn'`` and the
    lemma of every other word attached to the same synset, followed by an
    empty line.

    Args:
        word: Lemma to look up.  It is passed to SQLite as a bound parameter,
            so it may contain quotes or other SQL metacharacters.
        connection: Open ``sqlite3`` connection to query.  When omitted, the
            module-level ``conn`` is used.
    """
    db = conn if connection is None else connection

    print("")
    print("")
    print("##input: 【", word, "】")
    print("")

    # Check if the word exists on Wordnet
    word_id = None
    for (found_id,) in db.execute(
        "select wordid from word where lemma=?", (word,)
    ):
        # When several rows share the lemma, the last one returned is used.
        word_id = found_id
    if word_id is None:
        print("「%s」is not exist" % word)
        return

    # Get the concept
    synsets = [
        row[0]
        for row in db.execute(
            "select synset from sense where wordid=?", (word_id,)
        )
    ]
    print(synsets)

    # Display of words contained in the concept
    for synset in synsets:
        for (name,) in db.execute(
            "select name from synset where synset=?", (synset,)
        ):
            print("##concept: %s" % name)

        for (definition,) in db.execute(
            "select def from synset_def where (synset=? and lang='jpn')",
            (synset,),
        ):
            print("##meaning: %s" % definition)

        other_ids = db.execute(
            "select wordid from sense where (synset=? and wordid!=?)",
            (synset, word_id),
        ).fetchall()
        # The running number counts sense rows, so it is taken from the
        # outer loop even if a word id has no row in the word table.
        for number, (other_id,) in enumerate(other_ids, start=1):
            for (lemma,) in db.execute(
                "select lemma from word where wordid=?", (other_id,)
            ):
                print("Synonyms" + str(number) + ": %s" % lemma)
        print()
# Look up a sample word only when this file is executed as a script.
if __name__ == "__main__":
    word = "Automobile"
    main(word)
--Result 3
##input:【 Automobile 】
['03791235-n', '02958343-n']
##concept: motor_vehicle
##meaning:Vehicles with self-propelled wheels that do not run on rails
Synonyms 1: motor_vehicle
Synonyms 2: automotive_vehicle
Synonyms 3:Motor vehicle
##concept: auto
##meaning:4-wheel car
##meaning:Usually propelled by an internal combustion engine
Synonyms 1: auto
Synonyms 2: motorcar
Synonyms 3: machine
Synonyms 4: car
Synonyms 5: automobile
Synonyms 6:Four-wheeler
Synonyms 7:Automobile
Synonyms 8:car
Synonyms 9:Passenger car
Synonyms 10:Automobile
Synonyms 11:Motor car
The flow of similar-word extraction is as follows: when you enter "automobile", the program first follows the superordinate concepts of "automobile", namely "motor_vehicle" and "auto", and then extracts the words that belong under those concepts, which yields the words that stand in parallel with "automobile".
Recommended Posts