MNB (Multinomial Naive Bayes) Feature Weighting

from sklearn.datasets import fetch_20newsgroups
# Load the training subset of the 20 Newsgroups dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
# Notebook display: the list of category names in the dataset.
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

import pandas as pd

# Passing the fetched dataset object straight to pd.DataFrame fails with
# "ValueError: arrays must all be same length": pandas treats it as a dict,
# and its fields do not share one length (per-document fields vs. the short
# list of category names). Build the frame from the per-document fields only.
ng_df = pd.DataFrame({
    'text': newsgroups_train.data,
    'target': newsgroups_train.target,
    # Human-readable label for each document's integer target.
    'category': [newsgroups_train.target_names[t]
                 for t in newsgroups_train.target],
})

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-19-333e9965d118> in <module>
      1 import pandas as pd
----> 2 ng_df = pd.DataFrame(newsgroups_train)

/usr/local/lib/python3.7/site-packages/pandas/core/frame.py in __init__(self, data, index, columns, dtype, copy)
    409             )
    410         elif isinstance(data, dict):
--> 411             mgr = init_dict(data, index, columns, dtype=dtype)
    412         elif isinstance(data, ma.MaskedArray):
    413             import numpy.ma.mrecords as mrecords

/usr/local/lib/python3.7/site-packages/pandas/core/internals/construction.py in init_dict(data, index, columns, dtype)
    255             arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays
    256         ]
--> 257     return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
    258 
    259 

/usr/local/lib/python3.7/site-packages/pandas/core/internals/construction.py in arrays_to_mgr(arrays, arr_names, index, columns, dtype)
     75     # figure out the index, if necessary
     76     if index is None:
---> 77         index = extract_index(arrays)
     78     else:
     79         index = ensure_index(index)

/usr/local/lib/python3.7/site-packages/pandas/core/internals/construction.py in extract_index(data)
    366             lengths = list(set(raw_lengths))
    367             if len(lengths) > 1:
--> 368                 raise ValueError("arrays must all be same length")
    369 
    370             if have_dicts:

ValueError: arrays must all be same length

# Load per-token conditional probabilities for the negative and positive
# classes. The TSV is a hand-edited copy of Weka's MultinomialNB output.
import pandas as pd

weka_output = pd.read_csv('MNB-feature-weight.tsv', delimiter='\t')
features, neg_cond_prob, pos_cond_prob = (
    weka_output[column].values for column in ('token', 'neg', 'pos')
)
# Notebook display: preview the negative-class probabilities.
neg_cond_prob

array([7.51788182e-04, 4.22433359e-04, 4.80428448e-03, ...,
       7.87587619e-05, 3.29354823e-04, 2.79235610e-04])

# Peek at the first token, then its negative-class and positive-class
# conditional probabilities, one value per line.
for column in (features, neg_cond_prob, pos_cond_prob):
    print(column[0])

&
0.0007517881818897805
0.0006325763932817808

import math

# Log-ratio of the positive to the negative conditional probability of each
# token: log(P(token|pos)) - log(P(token|neg)). Values above 0 mean the token
# is relatively more probable in the positive class, below 0 in the negative.
# Iterating the two probability arrays in lockstep replaces the manual
# index loop; the entries stay aligned with `features`.
log_ratios = [
    math.log(pos) - math.log(neg)
    for pos, neg in zip(pos_cond_prob, neg_cond_prob)
]
print(log_ratios[0])

-0.17265361805312018

# Pair every token with its log-ratio and order the pairs ascending by
# log-ratio (ties fall back to comparing the tokens themselves).
feature_ranks = list(zip(log_ratios, features))
feature_ranks.sort()

# feature_ranks is sorted ascending by log-ratio, so its last 20 entries are
# the tokens with the highest pos/neg conditional probability ratio,
# i.e. the most positive words.
top_pos_features = feature_ranks[-20:]
print(top_pos_features)

[(1.3471721356912916, 'fantastic'), (1.3527011771704371, 'cameron'), (1.4004166502101043, 'hanks'), (1.4367842943809803, 'tarantino'), (1.4946038652698048, 'toy'), (1.5919548621089836, 'titanic'), (1.6368054282743358, 'ripley'), (1.7681414303354224, 'era'), (1.829177320921791, 'wonderfully'), (1.8422494024891432, 'scorsese'), (1.842249402489144, 'jedi'), (1.960032438145527, 'derek'), (2.3530750262551345, 'outstanding'), (2.441870525038266, 'truman'), (2.5210078455969906, 'damon'), (2.875264408671441, 'bulworth'), (3.228543763609035, 'lebowski'), (3.5852187075477664, 'flynt'), (3.7675402643417213, 'mulan'), (4.055222336793502, 'shrek')]

# feature_ranks is sorted ascending by log-ratio, so its first 20 entries are
# the tokens with the lowest pos/neg conditional probability ratio,
# i.e. the most negative words.
top_neg_features = feature_ranks[:20]
print(top_neg_features)

[(-4.18119819047189, '&nbsp'), (-3.3091135511022296, 'seagal'), (-2.706350432010554, 'schumacher'), (-2.263144995919541, 'wrestling'), (-2.191401091060701, 'godzilla'), (-1.982431592783855, 'spawn'), (-1.8954202157942248, 'wasted'), (-1.8419315308432394, 'lame'), (-1.8013912661459477, 'poorly'), (-1.772522079471071, 'worst'), (-1.7721875753702774, 'waste'), (-1.7388511551026857, 'ridiculous'), (-1.7130986590002708, 'awful'), (-1.7130986590002708, 'eve'), (-1.6570091923492267, 'stupid'), (-1.6495852532779436, 'snake'), (-1.635137117530559, 'unfunny'), (-1.5975857718784265, 'uninteresting'), (-1.5852652874903859, 'dull'), (-1.5536709220721674, 'arnold')]

# If the model classifies more than two categories,
# you can calculate the log ratio between the conditional probabilities of any two categories.

# if you simply print out the words with highest conditional probs in each category
# you may or may not get informative features 
# because some popular words in this category may also be popular in other categories.

# The following code prints the words with the
# highest positive conditional probabilities and the highest negative conditional probabilities;
# both lists include common words such as "film", "movie", "time", etc.

# Rank tokens by their raw conditional probability within each class
# (ascending), then show the 50 most probable tokens per class:
# positive class first, a blank line, then the negative class.
pos_features = sorted(zip(pos_cond_prob, features))
neg_features = sorted(zip(neg_cond_prob, features))

print(pos_features[-50:])
print()
print(neg_features[-50:])

[(0.0024851215450355678, '*'), (0.0024980312673474397, 'funny'), (0.002517395850815249, 'comedy'), (0.002556125017750869, 'star'), (0.002562579878906806, 'takes'), (0.002569034740062741, 'year'), (0.002594854184686488, 'played'), (0.002601309045842423, 'cast'), (0.0026271284904661693, 'fact'), (0.002711041685493345, 'find'), (0.0027691354358967747, 'family'), (0.002788500019364582, 'big'), (0.002827229186300204, 'young'), (0.0028465937697680125, 'audience'), (0.0028465937697680125, 'john'), (0.00288532293670363, 'real'), (0.00288532293670363, 'things'), (0.0030337847432901728, 'action'), (0.003130607660629221, 'years'), (0.003137062521785156, 'role'), (0.003156427105252967, 'made'), (0.003317798634151378, 'work'), (0.003337163217619189, 'director'), (0.0033629826622429354, 'end'), (0.0035501736357650936, 'performance'), (0.0036082673861685243, 'back'), (0.0036211771084803954, 'makes'), (0.0036792708588838244, '--'), (0.0036921805811956977, 'don'), (0.0037502743315991267, 'plot'), (0.003879371554717858, 'doesn'), (0.004060107667084079, 'movies'), (0.004085927111707826, 'scenes'), (0.004156930584423131, 'world'), (0.004182750029046874, 'love'), (0.0046345903099624325, 'scene'), (0.004770142394237098, 'man'), (0.00480887156117272, 'great'), (0.004996062534694879, 'make'), (0.005086430590877992, 'people'), (0.005260711842088278, '-'), (0.005667368094912279, 'films'), (0.006351583377441551, 'characters'), (0.0064032222666890425, 'life'), (0.006893791714540223, 'character'), (0.007565097274757618, 'time'), (0.007694194497876354, 'good'), (0.00794593408295787, 'story'), (0.01609196886174978, 'movie'), (0.033436180787751256, 'film')]

[(0.002606199030551239, 'guy'), (0.00262051880544438, 'actors'), (0.002649158355230655, 'acting'), (0.0026634781301237962, 'point'), (0.002670638017570365, 'plays'), (0.0026777979050169338, 'long'), (0.00271359734224978, 'role'), (0.0027422368920360586, 'minutes'), (0.002778036329268904, 'played'), (0.0027923561041620425, 'fact'), (0.0028353154288414586, 'great'), (0.0028711148660743053, 'things'), (0.002899754415860584, 'real'), (0.002949873627986569, 'comedy'), (0.003093071376917956, 'makes'), (0.0031789900262767863, 'funny'), (0.0032076295760630627, 'thing'), (0.0032147894635096343, 'love'), (0.0033651470998875893, 'audience'), (0.003408106424567007, 'back'), (0.003422426199460145, 'script'), (0.003515504736265545, 'isn'), (0.003522664623712114, 'life'), (0.0035369843986052547, 'work'), (0.003780420571788611, 'end'), (0.0038520194462543047, 'big'), (0.0038520194462543047, 'made'), (0.003988057307739123, 'movies'), (0.004066816069651386, '--'), (0.0042601330307087595, 'director'), (0.004288772580495035, '-'), (0.0043102522428347416, 'man'), (0.004338891792621018, 'action'), (0.0045178889787852545, 'scenes'), (0.004610967515590657, 'films'), (0.004718365827289194, 'people'), (0.004718365827289194, 'scene'), (0.004804284476648028, '*'), (0.004811444364094596, 'doesn'), (0.005104999749403939, 'don'), (0.0058639478187402895, 'make'), (0.006250581740855032, 'characters'), (0.0063007009529810155, 'plot'), (0.0065369772387178105, 'story'), (0.0066372156629697756, 'character'), (0.007295925308054158, 'bad'), (0.007961794840585104, 'time'), (0.008069193152283648, 'good'), (0.02270400309307137, 'movie'), (0.0304510013102594, 'film')]