Mémo TF-IDF. Il est beaucoup plus facile d'utiliser TfidfVectorizer, mais l'objectif ici est aussi d'étudier le calcul pas à pas. Si quelque chose vous semble étrange, merci de me le signaler.
Document à calculer
hoge.txt
white black red
white white black
white black black black
white
# Préparation
print(word_set)
['black', 'red', 'white']
print(doc_words)
[['white', 'black', 'red'], ['white', 'white', 'black'], ['white', 'black', 'black', 'black'], ['white']]
def tokenizer(word_set, doc_words):
    """Convert every document's words into vocabulary indices.

    Args:
        word_set: list of vocabulary words; a word's position is its id.
        doc_words: list of documents, each a list of words.

    Returns:
        list[list[int]]: one list of vocabulary indices per document.

    Raises:
        ValueError: if a document contains a word absent from word_set.
    """
    # Build the word -> index map once: O(1) per lookup instead of the
    # O(len(word_set)) scan that list.index performs for every word.
    index_of = {word: idx for idx, word in enumerate(word_set)}
    token_doc = []
    for words in doc_words:
        try:
            token_doc.append([index_of[w] for w in words])
        except KeyError as err:
            # Keep the ValueError contract that list.index provided.
            raise ValueError(f"{err.args[0]!r} is not in word_set") from None
    return token_doc
# Map every document to a list of vocabulary indices.
token_doc = tokenizer(word_set, doc_words)
#print(token_doc)
# Total number of documents (the N in the IDF formula below).
doc_num = len(token_doc)
#print(doc_num)
import math

# Smoothed inverse document frequency: idf(t) = ln(N / df(t)) + 1, so a
# term appearing in every document still gets weight 1 instead of 0.
# NOTE(review): assumes df > 0 — every vocabulary word must occur in at
# least one document, otherwise this divides by zero; confirm upstream.
IDF = []
for term_id in range(len(word_set)):
    # Document frequency: number of documents containing this term.
    # (The original reset a `count` accumulator twice; the redundant
    # pre-loop initialization and manual counting are replaced by sum().)
    df = sum(1 for doc in token_doc if term_id in doc)
    IDF.append(math.log(doc_num / df) + 1)
# Raw term-frequency vectors: TF_set[d][t] is the number of occurrences
# of vocabulary term t in document d (dense, one column per word).
TF_set = []
for doc in token_doc:
    TF = [doc.count(term_id) for term_id in range(len(word_set))]
    TF_set.append(TF)
# TF-IDF weighting: multiply each document's term counts elementwise by
# the per-term IDF weights computed above.
TF_IDF_set = []
TF_IDF = []
for tf_row in TF_set:
    TF_IDF = [tf_row[i] * IDF[i] for i in range(len(word_set))]
    TF_IDF_set.append(TF_IDF)
    TF_IDF = []
# Résultat
print(token_doc)
[[2, 0, 1], [2, 2, 0], [2, 0, 0, 0], [2]]
print(word_set)
['black', 'red', 'white']
print(TF_IDF_set)
[[1.2876820724517808, 2.386294361119891, 1.0], [1.2876820724517808, 0.0, 2.0], [3.8630462173553424, 0.0, 1.0], [0.0, 0.0, 1.0]]
Recommended Posts