[PYTHON] Create document classification data quickly using NLTK

After implementing a classifier by following a textbook or a paper, I want to check whether it actually classifies documents well. In that case, the easiest way to get evaluation data is to build it from one of the corpora that ship with NLTK. For document classification, NLTK provides a movie review corpus. Each movie review, labeled 'pos' or 'neg', is converted into simple {0, 1} features that indicate whether or not the review contains each word.

import nltk
from nltk.corpus import movie_reviews

def document_features(document, word_features):
    """Build a binary bag-of-words feature dict for one document.

    Every entry of ``word_features`` becomes a key of the result; its value
    is 1 when that word occurs in ``document`` and 0 otherwise.
    """
    # Collapse the document to a set so each membership test is a hash lookup.
    present = set(document)
    return {term: int(term in present) for term in word_features}

def dataset():
    """Build labelled binary bag-of-words vectors from NLTK's movie reviews.

    Returns:
        A list of ``(y_i, x_i)`` tuples, one per review. ``y_i`` is 1 for a
        review in the 'pos' category and -1 for one in the 'neg' category.
        ``x_i`` is a list of 0/1 values, one per vocabulary word, ordered by
        the sorted, lowercased vocabulary of the whole corpus.

    Raises:
        ValueError: If the corpus reports a category other than 'pos'/'neg'.
    """
    labels = {'pos': 1, 'neg': -1}

    # First pass: collect the lowercased vocabulary of every review.
    # ``update`` is used instead of ``FreqDist.inc``, which NLTK 3 removed.
    freqdist = nltk.FreqDist()
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            freqdist.update(
                word.lower() for word in movie_reviews.words(fileid)
            )

    # Sort the vocabulary once so every vector shares the same column order,
    # rather than re-sorting the feature dict for each document.
    word_features = sorted(freqdist)

    d = []
    for category in movie_reviews.categories():
        if category not in labels:
            # Fail loudly instead of reusing a stale or unbound label.
            raise ValueError('unexpected category: %r' % (category,))
        y_i = labels[category]
        for fileid in movie_reviews.fileids(category):
            # Lowercase the tokens so they match the lowercased vocabulary.
            words = [word.lower() for word in movie_reviews.words(fileid)]
            fv = document_features(words, word_features)
            x_i = [fv[word] for word in word_features]
            d.append((y_i, x_i))
    return d