[PYTHON] Predict the gender of Twitter users with machine learning

We had data for 20,000 Twitter users tagged with gender, so we used this data to predict the gender of Twitter users. Ruby is used for text processing and Python is used for machine learning.

Conclusion first

Gender prediction by simple machine learning using Twitter's profile was only about 60% accurate.

The data used this time is foreign language data, and the result is different from the Japanese profile, but the accuracy seems to be not so much as well. The reason I think this is because "Twitter user data is difficult to determine gender even if people see it in the first place."

Twitter user gender determination procedure

Ruby is used for steps 1 to 5, and Python is used for steps 6.

List the words in your profile
Record the number of occurrences of each word
Eliminate words that appear extremely infrequently or too often
Represent the user's profile as a vector by regarding the number of words as the number of dimensions
Create a label for the correct answer data
Apply machine learning

Pre-process text in Ruby

The Ruby code that performs steps 1 to 5 above is as follows. With a method like this one, the performance depends greatly on this text processing part. There are countless ways to do it, and this code does really minimal text processing.

# https://www.kaggle.com/crowdflower/twitter-user-gender-classification
def parse_kaggle_data
  str = File.read('gender-classifier-DFE-791531.csv', encoding: 'ISO-8859-1:UTF-8')
  lines = str.split("\r").map { |l| l.split(',') }
  header = lines[0]
  users = lines.drop(1).map { |l| header.map.with_index { |h, i| [h, l[i]] }.to_h }
  users = users.select { |u| %w(female male).include?(u['gender']) && u['gender:confidence'] == '1' }
  [users.map { |u| u['description'] }, users.map { |u| u['gender'] }]
end

def split_to_words(text_array)
  text_array.map { |d| d.split(/([\s"]|__REP__)/) }.flatten.
      map { |w| w.gsub(/^#/, '') }.
      map { |w| w.gsub(/[^.]\.+$/, '') }.
      map { |w| w.gsub(/[^!]!+$/, '') }.
      map { |w| w.gsub(/^\(/, '') }.
      map { |w| w.gsub(/^\)/, '') }.
      delete_if { |w| w.length < 2 }.
      map(&:downcase).sort.uniq
end

def count_words(text_array, word_array)
  words_count = Hash.new(0)
  text_array.each do |d|
    word_array.each do |w|
      if d.include?(w)
        words_count[w] += 1
      end
    end
  end
  words_count
end

descriptions, genders = parse_kaggle_data

desc_words = split_to_words(descriptions)
desc_words_count = count_words(descriptions, desc_words)
filtered_desc_words = desc_words.select { |w| desc_words_count[w] > 2 && desc_words_count[w] < 500 }
desc_vectors = descriptions.map { |d| filtered_desc_words.map { |w| d.include?(w) ? 1 : 0 } }
File.write('data/description_vectors.txt', desc_vectors.map { |v| v.join(' ') }.join("\n"))

labels = genders.map do |g|
  case g
  when '';        0
  when 'brand';   1
  when 'female';  2
  when 'male';    3
  when 'unknown'; 4
  end
end
File.write('data/labels.txt', labels.join("\n"))

Machine learning with Python

I've tried Naive Bayes, Logistic Regression, Random Forest, and Support Vector Machines, all with similar results.

Method	accuracy
Naive Bayes (normal distribution)	0.5493
Naive Bayes (Bernoulli)	0.6367
Logistic regression	0.6151
Random forest	0.6339
Support vector machine	0.6303

It should be noted that each method has a tacit assumption about the original data, but this time we do not consider it and simply compare the results.

# sudo yum install -y python3
# sudo pip3 install -U pip numpy sklearn ipython

import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import pickle

description_vectors = np.loadtxt('data/description_vectors.txt')
labels = np.loadtxt('data/labels.txt')

(x_train, x_test, y_train, y_test) = train_test_split(description_vectors, labels)

clf = GaussianNB().fit(x_train, y_train)
clf = BernoulliNB().fit(x_train, y_train)
clf = LogisticRegression().fit(x_train, y_train)
clf = RandomForestClassifier().fit(x_train, y_train)
clf = SVC(C = 1.0).fit(x_train, y_train)

y_pred = clf.predict(x_test)
np.mean(y_test == y_pred)

# Grid search

# best params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}
parameters = [{'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'C': np.logspace(-2, 2, 5), 'gamma': ['scale']}]
clf = GridSearchCV(SVC(), parameters, verbose = True, n_jobs = -1)
clf.fit(x_train, y_train)

# best params: {'max_depth': 100, 'n_estimators': 300}
parameters = [{'n_estimators': [30, 50, 100, 300], 'max_depth': [25, 30, 40, 50, 100]}]
clf = GridSearchCV(RandomForestClassifier(), parameters, verbose = True, n_jobs = -1)
clf.fit(x_train, y_train)

print(clf.best_params_)
print(clf.best_score_)
print(clf.best_estimator_)

print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# Model persistence

pickle.dump(clf, open('model.sav', 'wb'))
clf = pickle.load(open('model.sav', 'rb'))

[PYTHON] Predict the gender of Twitter users with machine learning

Conclusion first

Twitter user gender determination procedure

Pre-process text in Ruby

Machine learning with Python

Related Links