# the feature file is an edited copy of Weka's SMO output (one weight per token)
import pandas as pd
# Load the per-token weights exported from Weka. The TSV must provide the
# 'token' and 'weight' columns that are read below.
weka_output = pd.read_csv('Weka-SMO-movie-review-weight.tsv', delimiter='\t')
print(weka_output.shape)
weights = weka_output['weight'].values
features = weka_output['token'].values

# Rank the (weight, token) pairs by ascending weight, breaking ties on the
# token text. Tokens are compared through str() because pandas, by default,
# parses tokens such as "null" or "nan" as float NaN; comparing that float
# with a str token on a weight tie would raise TypeError.
feature_ranks = sorted(
    zip(weights, features),
    key=lambda pair: (pair[0], str(pair[1])),
)

# Number of entries to show from each end of the ranking.
TOP_N = 20

# print the words with the highest (most positive) weights, largest last.
# Slices are used instead of index arithmetic so that a file with fewer than
# TOP_N features prints what is available instead of raising IndexError.
for ranked in feature_ranks[-TOP_N:]:
    print(ranked)
# print the words with the lowest (most negative) weights, lowest first
for ranked in feature_ranks[:TOP_N]:
    print(ranked)
# If the model has to classify more than two categories, e.g. the 20 categories in the 20newsgroup data,
# SMO uses a one-vs-one approach, so it actually creates 190 binary classifiers (20 * 19 / 2).
# You will find a separate list of feature weights for each binary classifier, each starting with a line like
# "Classifier for classes: alt.atheism, comp.graphics"
# "Classifier for classes: alt.atheism, comp.os.ms-windows.misc"
# ...
# "Classifier for classes: talk.politics.misc, talk.religion.misc"
# You can sort each individual list to check the most indicative features for that binary classifier