Link

This article contains only the code. For the problem statements and explanations of how to solve them, please refer to the link below.

Language Processing 100 Knock 2020 Chapter 7: Word Vector

Chapter 7: Word Vector

60. Reading and displaying word vectors

import gensim
# Pre-trained vectors stored in the binary word2vec format.
w2v_path = 'GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

# Look up (and, in a notebook, display) the vector stored for "United_States".
model['United_States']

61. Word similarity

# Similarity score the model assigns to the two spellings of the country name.
word_a, word_b = 'United_States', 'U.S.'
model.similarity(word_a, word_b)

62. 10 words with high similarity

# The ten vocabulary entries ranked most similar to "United_States".
model.most_similar(positive=['United_States'], topn=10)

63. Analogy by additive composition

# Analogy query: "Spain" and "Athens" contribute positively, "Madrid"
# negatively; the ten best-ranked candidates are returned.
added_terms = ['Spain', 'Athens']
removed_terms = ['Madrid']
model.most_similar(positive=added_terms, negative=removed_terms, topn=10)

64. Experiments with analogy data

# Load every line of the analogy data set up front.
with open('questions-words.txt') as src:
    questions = src.readlines()

# Write one output line per input line to 64.txt.
with open('64.txt', 'w') as dst:
    for idx, question in enumerate(questions):
        fields = question.split()
        if len(fields) != 4:
            # Anything that is not a four-word question is copied unchanged.
            dst.write(question)
        else:
            first, second, third = fields[0], fields[1], fields[2]
            # Top-1 candidate for: second - first + third.
            best_word, best_score = model.most_similar(
                positive=[second, third], negative=[first], topn=1
            )[0]
            # Append the predicted word and its score to the original fields.
            dst.write(' '.join(fields + [best_word, str(best_score)]) + '\n')
        # Progress indicator, printed every 100 input lines.
        if idx % 100 == 0:
            print(idx)

65. Correct answer rate in analogy tasks

with open('64.txt', 'r') as f:
    questions = f.readlines()

# A line has six fields when it holds the four question words followed by the
# predicted word and its score; only those lines are evaluated.
evaluated = [fields for fields in map(str.split, questions) if len(fields) == 6]

# cnt: number of evaluated questions.
# ok: how many of them have a prediction (field 4) equal to the expected
#     word (field 3).
cnt = len(evaluated)
ok = sum(1 for fields in evaluated if fields[3] == fields[4])
print(ok / cnt)

66. Evaluation by WordSimilarity-353

import pandas as pd
df = pd.read_csv('wordsim353/combined.csv')

# Model similarity for every word pair, in row order. Iterating the two
# columns in lockstep avoids building a Series per row via df.iloc[i].
sim = [
    model.similarity(word_1, word_2)
    for word_1, word_2 in zip(df['Word 1'], df['Word 2'])
]
df['w2v'] = sim

# Spearman rank correlation between the human ratings and the model scores.
df[['Human (mean)', 'w2v']].corr(method='spearman')

67. k-means clustering

from sklearn.cluster import KMeans
with open('country.txt','r') as f:
    lines = f.readlines()

# The country name is the last field of each line; the separator in the split
# below is a full-width (ideographic) space, not an ASCII space.
countries = [line.split('　')[-1].replace('\n','') for line in lines]

# Names that are replaced by a different key before the vocabulary lookup.
dic = {'United States of America':'United_States', 'Russian Federation':'Russia'}

ng = 0                 # countries for which the model has no vector
vec = []               # vectors of the countries that were found
target_countries = []  # lookup keys, aligned index-by-index with `vec`
for c in countries:
    for k, v in dic.items():
        c = c.replace(k, v)
    # Normalize the name into a single underscore-joined key.
    c = c.replace(' ','_').replace('-','_').replace('_and_','_')
    try:
        country_vec = model[c]
    except KeyError:
        # Key is not in the model's vocabulary: skip it but keep count.
        # Only KeyError is caught so that unrelated failures are not hidden.
        ng += 1
    else:
        vec.append(country_vec)
        target_countries.append(c)

# Cluster the country vectors into five groups; the seed is fixed.
kmeans = KMeans(n_clusters=5, random_state=0)
kmeans.fit(vec)
for c, label in zip(target_countries, kmeans.labels_):
    print(c, label)

68. Ward's method clustering

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Hierarchical clustering of the country vectors using Ward linkage.
link = linkage(vec, method='ward')

# Draw the dendrogram on a 32x24 figure, with rotated leaf labels.
plt.figure(figsize=(32.0, 24.0))
dendrogram(
    link,
    labels=target_countries,
    leaf_rotation=90,
    leaf_font_size=10,
)
plt.show()

69. Visualization by t-SNE

from sklearn.manifold import TSNE
# Project the country vectors to 2-D. The seed is fixed (as it is for the
# k-means step) so that the layout is reproducible between runs.
vec_embedded = TSNE(n_components=2, random_state=0).fit_transform(vec)

fig, ax = plt.subplots(figsize=(16, 12))
# Column 0 holds the x coordinates, column 1 the y coordinates.
ax.scatter(vec_embedded[:, 0], vec_embedded[:, 1])
# Label every point with its country key.
for (x, y), c in zip(vec_embedded, target_countries):
    ax.annotate(c, (x, y))
plt.show()

[PYTHON] 100 language processing knock 2020 [00 ~ 69 answer]

Link

Chapter 7: Word Vector

60. Reading and displaying word vectors

61. Word similarity

62. 10 words with high similarity

63. Analogy by additive composition

64. Experiments with analogy data

65. Correct answer rate in analogy tasks

66. Evaluation by WordSimilarity-353

67. k-means clustering

68. Ward's method clustering

69. Visualization by t-SNE