Weka SMO Feature Weighting

In [5]:
# the feature file is an edited copy of Weka's SMO output
import pandas as pd
weka_output = pd.read_csv('Weka-SMO-movie-review-weight.tsv', delimiter='\t')
print(weka_output.shape)
weights = weka_output['weight'].values
features = weka_output['token'].values
(1165, 2)
In [6]:
# sort (weight, token) pairs in ascending order of weight
feature_ranks = sorted(zip(weights, features))

# print the 20 words with the highest positive weights (the end of the ascending list)
for i in range(len(feature_ranks)-20, len(feature_ranks)):
    print(feature_ranks[i])
(1.0299, 'everything')
(1.0387, 'entertaining')
(1.0463, 'solid')
(1.0498, 'easily')
(1.0644, 'follows')
(1.0691, 'm')
(1.091, 'view')
(1.1209, 'good')
(1.1213, 'light')
(1.1237, 'excellent')
(1.1345, 'truman')
(1.1521, 'perfectly')
(1.1981, 'using')
(1.2062, 'overall')
(1.2675, 'seen')
(1.2773, 'fun')
(1.2783, 'back')
(1.3142, 'terrific')
(1.4153, 'flaws')
(1.8324, 'memorable')
In [7]:
# print the 20 words with the most negative weights (the start of the ascending list)
for i in range(0, 20):
    print(feature_ranks[i])
(-1.9298, 'worst')
(-1.7567, 'unfortunately')
(-1.589, 'boring')
(-1.5658, 'bad')
(-1.5652, 'ridiculous')
(-1.5648, 'awful')
(-1.4931, 'filmmakers')
(-1.3724, 'looks')
(-1.3459999999999999, 'mess')
(-1.3187, 'lame')
(-1.3014, 'script')
(-1.2939, 'somewhere')
(-1.2823, 'any')
(-1.2474, 'material')
(-1.2257, 'bland')
(-1.2154, 'terrible')
(-1.1767, 'only')
(-1.145, 'falls')
(-1.136, '*')
(-1.1291, 'plot')
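In [ ]:
# the same two rankings can also be read straight off the DataFrame with pandas;
# this just reuses weka_output loaded above and should agree with the lists printed above
print(weka_output.nlargest(20, 'weight'))    # strongest positive features
print(weka_output.nsmallest(20, 'weight'))   # strongest negative features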
In [8]:
# if the model classifies more than two categories, e.g. the 20 categories in the 20 Newsgroups data,
# SMO uses a one-vs-one approach, so it actually trains 190 (= 20 * 19 / 2) binary classifiers
# you will find feature weights for each binary classifier, starting with lines like

# "Classifier for classes: alt.atheism, comp.graphics"
# "Classifier for classes: alt.atheism, comp.os.ms-windows.misc"
# ...
# "Classifier for classes: talk.politics.misc, talk.religion.misc"

# you can sort each classifier's list to check its most indicative features, as sketched in the next cell
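In [ ]:
# a minimal sketch of splitting a multi-class weight dump into per-classifier ranked lists;
# it assumes a hypothetical file 'Weka-SMO-20news-weight.txt' that has been edited like the
# TSV above, i.e. each "Classifier for classes: ..." header is followed by
# token<TAB>weight lines; adjust the parsing to your actual edited format
per_pair = {}
current = None
with open('Weka-SMO-20news-weight.txt') as f:
    for line in f:
        line = line.strip()
        if line.startswith('Classifier for classes:'):
            # start a new list for this pair of classes
            current = line[len('Classifier for classes:'):].strip()
            per_pair[current] = []
        elif current and '\t' in line:
            token, weight = line.split('\t', 1)
            try:
                per_pair[current].append((float(weight), token))
            except ValueError:
                pass  # skip header or malformed lines

# sort each binary classifier's list and peek at its strongest features
for pair, ranks in sorted(per_pair.items()):
    ranks.sort()
    print(pair)
    print('  most negative:', ranks[:5])
    print('  most positive:', ranks[-5:])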