"""
37. Top 10 words that frequently co-occur with "cat"
Display the 10 words that most often co-occur with "cat" (i.e. have the highest co-occurrence frequency), together with their frequencies, as a graph (for example, a bar chart).
sentence_list:
[[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'one', 'base': 'one', 'pos': 'noun', 'pos1': 'number'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
[{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'},
{'surface': 'I', 'base': 'I', 'pos': 'noun', 'pos1': '代noun'},
{'surface': 'Is', 'base': 'Is', 'pos': 'Particle', 'pos1': '係Particle'},
{'surface': 'Cat', 'base': 'Cat', 'pos': 'noun', 'pos1': 'General'},
{'surface': 'so', 'base': 'Is', 'pos': 'Auxiliary verb', 'pos1': '*'},
{'surface': 'is there', 'base': 'is there', 'pos': 'Auxiliary verb', 'pos1': '*'},
{'surface': '。', 'base': '。', 'pos': 'symbol', 'pos1': 'Kuten'},
{'surface': '', 'base': '*', 'pos': 'BOS/EOS', 'pos1': '*'}],
Memo:
-Co-occurrence frequency: https://www.jtp.co.jp/techport/2018-04-18-001/
"""
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import utils
# Apply the "ggplot" style sheet to the figures drawn by this script.
plt.style.use("ggplot")
# Override the font family for all plot text; chosen for Japanese support
# (assumes this font is installed on the machine — TODO confirm).
plt.rcParams["font.family"] = "Hiragino Mincho ProN" # Japanese-capable font
def get_co_occurrence(
    sentence_list: List[List[dict]], target: str = "Cat"
) -> list:
    """Count the words that appear in the same sentence as ``target``.

    Each sentence is a list of morpheme dicts. The first and last entries of
    every sentence are skipped (in the sample data they are the BOS/EOS
    markers) and the ``"surface"`` value of each remaining entry is used as
    the word. For every sentence that contains ``target``, each word
    occurrence in that sentence is counted; ``target`` itself is excluded
    from the result.

    Args:
        sentence_list: Sentences, each a list of dicts with a ``"surface"``
            key.
        target: Surface form whose co-occurring words are counted.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first.
        Words with equal counts keep first-seen order. The list is empty
        when no sentence contains ``target``.
    """
    counter = defaultdict(int)
    for sent in sentence_list:
        # Drop the leading and trailing sentence-boundary entries.
        words = [word["surface"] for word in sent[1:-1]]
        if target not in words:
            continue
        for word in words:
            counter[word] += 1
    # pop() with a default instead of ``del``: ``del`` raises KeyError when
    # the target never occurred (e.g. empty input).
    counter.pop(target, None)
    # sorted() is stable, so ties stay in insertion order.
    return sorted(counter.items(), key=lambda item: item[1], reverse=True)
def plot_co_occurrence(x: list, y: list) -> None:
    """Draw and show a bar chart of co-occurrence counts.

    Args:
        x: Terms used as the bar labels.
        y: Frequencies, one per term in ``x``.
    """
    plt.bar(x, y)
    plt.title("Co-occurrence with 'Cat'")
    plt.ylabel("Frequency")
    plt.xlabel("Term")
    # One tick per bar, labelled with the corresponding term.
    tick_positions = list(range(len(x)))
    plt.xticks(tick_positions, x)
    plt.show()
# Load the sentence data from the JSON file and rank the co-occurring words.
sentence_list = utils.read_json("30_neko_mecab.json")
counter = get_co_occurrence(sentence_list)
# Sample of the ranking:
# [('of', 391), ('Is', 272), ('、', 252), ('To', 250), ('To', 232)]
# Split the ten highest-ranked pairs into labels and counts, then plot them.
x = []
y = []
for term, freq in counter[:10]:
    x.append(term)
    y.append(freq)
plot_co_occurrence(x, y)
# ![image-20200527193140109](https://raw.githubusercontent.com/LearnXu/images/master/imgs/image-20200527193140109.png)
# Recommended Posts