MNB Feature Weighting

In [26]:
# Load per-token class-conditional probabilities from a hand-edited copy of
# Weka's MultinomialNB output (tab-separated; columns 'token', 'neg', 'pos').
import pandas as pd
weka_output = pd.read_csv(
    '/Users/byu/Desktop/Data/MNB-feature-weight.tsv',
    sep='\t',
)
# NOTE(review): pandas' default NA parsing turns tokens such as "NA" or "null"
# into NaN -- confirm the vocabulary has none, or pass keep_default_na=False.
# Pull the three columns out as parallel arrays (same row order).
features, neg_cond_prob, pos_cond_prob = (
    weka_output[column].values for column in ('token', 'neg', 'pos')
)
In [27]:
# Sanity check: show the first token with its negative and positive
# conditional probabilities, one value per line.
print(features[0], neg_cond_prob[0], pos_cond_prob[0], sep='\n')
&
0.00075178818189
0.000632576393282
In [28]:
import math
# Per-token log ratio log P(token|pos) - log P(token|neg), in the same order
# as `features`: values above zero mean the token is more probable under the
# positive class, values below zero under the negative class.
log_ratios = [
    math.log(pos_cond_prob[idx]) - math.log(neg_cond_prob[idx])
    for idx in range(len(features))
]
print(log_ratios[0])
-0.17265361805312018
In [31]:
# Rank tokens by ascending log ratio; the (ratio, token) tuples sort on the
# ratio first and fall back to the token on ties.
feature_ranks = list(zip(log_ratios, features))
feature_ranks.sort()

# The tail of the ascending ranking holds the 20 largest pos/neg ratios,
# i.e. the most positive words.
top_pos_features = feature_ranks[-20:]
print(top_pos_features)
[(1.3471721356912916, 'fantastic'), (1.3527011771704371, 'cameron'), (1.4004166502101043, 'hanks'), (1.4367842943809803, 'tarantino'), (1.4946038652698048, 'toy'), (1.5919548621089836, 'titanic'), (1.6368054282743358, 'ripley'), (1.7681414303354224, 'era'), (1.829177320921791, 'wonderfully'), (1.8422494024891432, 'scorsese'), (1.842249402489144, 'jedi'), (1.960032438145527, 'derek'), (2.3530750262551345, 'outstanding'), (2.441870525038266, 'truman'), (2.5210078455969906, 'damon'), (2.875264408671441, 'bulworth'), (3.228543763609035, 'lebowski'), (3.5852187075477664, 'flynt'), (3.7675402643417213, 'mulan'), (4.055222336793502, 'shrek')]
In [32]:
# The head of the ascending ranking holds the 20 smallest pos/neg ratios,
# i.e. the most negative words.
top_neg_features = feature_ranks[0:20]
print(top_neg_features)
[(-4.18119819047189, '&nbsp'), (-3.3091135511022296, 'seagal'), (-2.706350432010554, 'schumacher'), (-2.263144995919541, 'wrestling'), (-2.191401091060701, 'godzilla'), (-1.982431592783855, 'spawn'), (-1.8954202157942248, 'wasted'), (-1.8419315308432394, 'lame'), (-1.8013912661459477, 'poorly'), (-1.772522079471071, 'worst'), (-1.7721875753702774, 'waste'), (-1.7388511551026857, 'ridiculous'), (-1.7130986590002708, 'awful'), (-1.7130986590002708, 'eve'), (-1.6570091923492267, 'stupid'), (-1.6495852532779436, 'snake'), (-1.635137117530559, 'unfunny'), (-1.5975857718784265, 'uninteresting'), (-1.5852652874903859, 'dull'), (-1.5536709220721674, 'arnold')]
In [33]:
# If the model classifies more than two categories,
# you can calculate the log ratio between the conditional probabilities of any two categories.

# if you simply print out the words with highest conditional probs in each category
# you may or may not get informative features 
# because some popular words in this category may also be popular in other categories.

# The following code prints out the words with 
# highest positive conditional probs and highest negative conditional probs
# and both lists include common words like "are", "this", etc.

# Rank tokens by raw positive-class conditional probability and show the 50
# largest, followed by a blank separator line.
pos_features = list(zip(pos_cond_prob, features))
pos_features.sort()
print(pos_features[-50:], end='\n\n')
# Same ranking for the negative class.
neg_features = list(zip(neg_cond_prob, features))
neg_features.sort()
print(neg_features[-50:])
[(0.0024851215450355678, '*'), (0.0024980312673474397, 'funny'), (0.0025173958508152489, 'comedy'), (0.0025561250177508692, 'star'), (0.002562579878906806, 'takes'), (0.0025690347400627411, 'year'), (0.0025948541846864881, 'played'), (0.0026013090458424232, 'cast'), (0.0026271284904661693, 'fact'), (0.0027110416854933448, 'find'), (0.0027691354358967747, 'family'), (0.0027885000193645822, 'big'), (0.0028272291863002041, 'young'), (0.0028465937697680125, 'audience'), (0.0028465937697680125, 'john'), (0.0028853229367036301, 'real'), (0.0028853229367036301, 'things'), (0.0030337847432901728, 'action'), (0.0031306076606292211, 'years'), (0.0031370625217851558, 'role'), (0.0031564271052529672, 'made'), (0.003317798634151378, 'work'), (0.0033371632176191889, 'director'), (0.0033629826622429354, 'end'), (0.0035501736357650936, 'performance'), (0.0036082673861685243, 'back'), (0.0036211771084803954, 'makes'), (0.0036792708588838244, '--'), (0.0036921805811956977, 'don'), (0.0037502743315991267, 'plot'), (0.003879371554717858, 'doesn'), (0.0040601076670840792, 'movies'), (0.0040859271117078258, 'scenes'), (0.0041569305844231311, 'world'), (0.0041827500290468741, 'love'), (0.0046345903099624325, 'scene'), (0.0047701423942370976, 'man'), (0.0048088715611727204, 'great'), (0.0049960625346948794, 'make'), (0.0050864305908779918, 'people'), (0.005260711842088278, '-'), (0.0056673680949122793, 'films'), (0.0063515833774415512, 'characters'), (0.0064032222666890425, 'life'), (0.0068937917145402228, 'character'), (0.0075650972747576184, 'time'), (0.0076941944978763544, 'good'), (0.0079459340829578697, 'story'), (0.016091968861749781, 'movie'), (0.033436180787751256, 'film')]

[(0.0026061990305512391, 'guy'), (0.0026205188054443801, 'actors'), (0.0026491583552306552, 'acting'), (0.0026634781301237962, 'point'), (0.0026706380175703648, 'plays'), (0.0026777979050169338, 'long'), (0.00271359734224978, 'role'), (0.0027422368920360586, 'minutes'), (0.002778036329268904, 'played'), (0.0027923561041620425, 'fact'), (0.0028353154288414586, 'great'), (0.0028711148660743053, 'things'), (0.0028997544158605839, 'real'), (0.002949873627986569, 'comedy'), (0.0030930713769179558, 'makes'), (0.0031789900262767863, 'funny'), (0.0032076295760630627, 'thing'), (0.0032147894635096343, 'love'), (0.0033651470998875893, 'audience'), (0.0034081064245670071, 'back'), (0.0034224261994601451, 'script'), (0.0035155047362655451, 'isn'), (0.0035226646237121141, 'life'), (0.0035369843986052547, 'work'), (0.0037804205717886109, 'end'), (0.0038520194462543047, 'big'), (0.0038520194462543047, 'made'), (0.003988057307739123, 'movies'), (0.0040668160696513862, '--'), (0.0042601330307087595, 'director'), (0.0042887725804950346, '-'), (0.0043102522428347416, 'man'), (0.0043388917926210184, 'action'), (0.0045178889787852545, 'scenes'), (0.0046109675155906566, 'films'), (0.0047183658272891942, 'people'), (0.0047183658272891942, 'scene'), (0.0048042844766480282, '*'), (0.0048114443640945963, 'doesn'), (0.0051049997494039389, 'don'), (0.0058639478187402895, 'make'), (0.0062505817408550317, 'characters'), (0.0063007009529810155, 'plot'), (0.0065369772387178105, 'story'), (0.0066372156629697756, 'character'), (0.0072959253080541577, 'bad'), (0.0079617948405851045, 'time'), (0.008069193152283648, 'good'), (0.02270400309307137, 'movie'), (0.0304510013102594, 'film')]
In [ ]: