After implementing a classifier by following a textbook or a paper, I want to check whether it actually classifies well. In such a case, it is convenient to build evaluation data from the corpora provided by NLTK. For document classification, NLTK provides a movie review corpus. Each movie review, labeled 'pos' or 'neg', is converted into simple {0, 1} features indicating whether or not the review contains each word.
import nltk
from nltk.corpus import movie_reviews
def document_features(document, word_features):
    """Return a binary bag-of-words feature dict for one document.

    Args:
        document: iterable of hashable tokens making up the document.
        word_features: iterable of tokens to test for.

    Returns:
        A dict mapping every token in ``word_features`` to 1 if it occurs
        in ``document`` and to 0 otherwise.
    """
    # Build the set once so each membership test is a hash lookup.
    present = set(document)
    return {token: (1 if token in present else 0) for token in word_features}
def dataset():
    """Build a labelled binary bag-of-words dataset from ``movie_reviews``.

    The vocabulary is every lower-cased word found in the corpus. Each
    review becomes one ``(y_i, x_i)`` pair, where ``y_i`` is 1 for the
    'pos' category and -1 for the 'neg' category, and ``x_i`` is a list of
    0/1 flags, one per vocabulary word in sorted word order.

    Returns:
        A list of ``(y_i, x_i)`` tuples, one per review file.
    """
    labels = {'pos': 1, 'neg': -1}

    # First pass: collect the vocabulary over the whole corpus.
    freqdist = nltk.FreqDist()
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            for word in movie_reviews.words(fileid):
                # FreqDist.inc() is not available in NLTK 3; incrementing
                # the item directly is the supported way to count a sample.
                freqdist[word.lower()] += 1

    # Sort once so every feature vector shares the same column order,
    # instead of re-sorting the full vocabulary for each document.
    word_features = sorted(freqdist)

    # Second pass: turn each review into a (label, feature vector) pair.
    d = []
    for category in movie_reviews.categories():
        if category not in labels:
            # No numeric label is defined for this category; skip it rather
            # than reuse a stale label or fail on an unbound name.
            continue
        y_i = labels[category]
        for fileid in movie_reviews.fileids(category):
            # Lower-case the tokens so they match the lower-cased vocabulary.
            words = [word.lower() for word in movie_reviews.words(fileid)]
            fv = document_features(words, word_features)
            x_i = [fv[word] for word in word_features]
            d.append((y_i, x_i))
    return d
Recommended Posts