In [33]:
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.probability import FreqDist
# file = open('WK2/moviereview_arff.arff')
# tokens = []
# for line in file:
# #     print(type(line))
# #     tokens.append(word_tokenize(line))
# len(tokens)
In [46]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
file = open('WK2/moviereview.csv')
all_df = pd.DataFrame(file)
In [35]:
# freq_dist_sent = []
# for sent in tokenized_sentences[1:2]:
#     fdist = FreqDist(sent)
#     print(len(sent))
#     print(fdist['bad'])
# #     print(fdist.items())
# #     print(sent)
In [ ]:
 
In [47]:
from nltk.tokenize import word_tokenize
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokenized'] = all_df.apply(lambda x: get_tokens(x[0]),axis=1)
all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)
In [48]:
all_df
Out[48]:
0 tokenized tokenized_count
0 text,reviewclass\n [text, reviewclass] 2
1 'plot : two teen couples go to a church party ... [two, teen, couples, go, to, a, church, party,... 638
2 'the happy bastard\'s quick movie review \ndam... [happy, quick, movie, review, that, bug, got, ... 215
3 'it is movies like these that make a jaded mov... [is, movies, like, these, that, make, a, jaded... 444
4 ' \" quest for camelot \" is warner bros . \' ... [quest, for, camelot, is, warner, bros, first,... 410
... ... ... ...
1996 'wow ! what a movie . \nit\'s everything a mov... [what, a, movie, everything, a, movie, can, be... 702
1997 'richard gere can be a commanding actor , but ... [gere, can, be, a, commanding, actor, but, not... 286
1998 'glory--starring matthew broderick , denzel wa... [starring, matthew, broderick, denzel, washing... 990
1999 'steven spielberg\'s second epic film on world... [second, epic, film, on, world, war, ii, is, a... 538
2000 'truman ( \" true-man \" ) burbank is the perf... [burbank, is, the, perfect, name, for, jim, ch... 901

2001 rows × 3 columns

In [49]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text
all_df['no_stopwords'] = all_df.apply(lambda x: remove_stopwords(x['tokenized']),axis=1)
all_df['no_stopwords_count'] = all_df.apply(lambda x: len(x['no_stopwords']),axis=1)
In [50]:
all_df
Out[50]:
0 tokenized tokenized_count no_stopwords no_stopwords_count
0 text,reviewclass\n [text, reviewclass] 2 [text, reviewclass] 2
1 'plot : two teen couples go to a church party ... [two, teen, couples, go, to, a, church, party,... 638 [two, teen, couples, go, church, party, drink,... 306
2 'the happy bastard\'s quick movie review \ndam... [happy, quick, movie, review, that, bug, got, ... 215 [happy, quick, movie, review, bug, got, head, ... 119
3 'it is movies like these that make a jaded mov... [is, movies, like, these, that, make, a, jaded... 444 [movies, like, make, jaded, movie, viewer, tha... 246
4 ' \" quest for camelot \" is warner bros . \' ... [quest, for, camelot, is, warner, bros, first,... 410 [quest, camelot, warner, bros, first, attempt,... 234
... ... ... ... ... ...
1996 'wow ! what a movie . \nit\'s everything a mov... [what, a, movie, everything, a, movie, can, be... 702 [movie, everything, movie, funny, dramatic, in... 355
1997 'richard gere can be a commanding actor , but ... [gere, can, be, a, commanding, actor, but, not... 286 [gere, commanding, actor, always, great, films... 148
1998 'glory--starring matthew broderick , denzel wa... [starring, matthew, broderick, denzel, washing... 990 [starring, matthew, broderick, denzel, washing... 561
1999 'steven spielberg\'s second epic film on world... [second, epic, film, on, world, war, ii, is, a... 538 [second, epic, film, world, war, ii, unquestio... 287
2000 'truman ( \" true-man \" ) burbank is the perf... [burbank, is, the, perfect, name, for, jim, ch... 901 [burbank, perfect, name, jim, character, film,... 483

2001 rows × 5 columns

In [51]:
from nltk.probability import FreqDist
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist
all_df['fdist'] = all_df.apply(lambda x: get_most_common(x['no_stopwords']),axis=1)
In [53]:
all_df = all_df[1:]
# all_df
In [55]:
# In the 2000 docs, how many times was "bad" used
# inverse of the normalized value
def get_bad(fdist): 
#     fdist['bad']
    return fdist['bad']

# import math
# (math.log10(2000/760))
# print((all_df['bad']!=0).sum())

def get_tfidf(fdist): 
    return fdist['bad']*(math.log10(2000/760))

all_df['bad'] = all_df.apply(lambda x: get_bad(x['fdist']),axis=1)
all_df['tfidf_bad'] = all_df.apply(lambda x: get_tfidf(x['fdist']),axis=1)
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
In [57]:
all_df
Out[57]:
0 tokenized tokenized_count no_stopwords no_stopwords_count fdist bad tfidf_bad
1 'plot : two teen couples go to a church party ... [two, teen, couples, go, to, a, church, party,... 638 [two, teen, couples, go, church, party, drink,... 306 {'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '... 2 0.840433
2 'the happy bastard\'s quick movie review \ndam... [happy, quick, movie, review, that, bug, got, ... 215 [happy, quick, movie, review, bug, got, head, ... 119 {'happy': 1, 'quick': 1, 'movie': 5, 'review':... 0 0.000000
3 'it is movies like these that make a jaded mov... [is, movies, like, these, that, make, a, jaded... 444 [movies, like, make, jaded, movie, viewer, tha... 246 {'movies': 1, 'like': 4, 'make': 2, 'jaded': 1... 0 0.000000
4 ' \" quest for camelot \" is warner bros . \' ... [quest, for, camelot, is, warner, bros, first,... 410 [quest, camelot, warner, bros, first, attempt,... 234 {'quest': 5, 'camelot': 4, 'warner': 1, 'bros'... 0 0.000000
5 'synopsis : a mentally unstable man undergoing... [a, mentally, unstable, man, undergoing, psych... 658 [mentally, unstable, man, undergoing, psychoth... 346 {'mentally': 1, 'unstable': 1, 'man': 2, 'unde... 2 0.840433
... ... ... ... ... ... ... ... ...
1996 'wow ! what a movie . \nit\'s everything a mov... [what, a, movie, everything, a, movie, can, be... 702 [movie, everything, movie, funny, dramatic, in... 355 {'movie': 14, 'everything': 2, 'funny': 5, 'dr... 0 0.000000
1997 'richard gere can be a commanding actor , but ... [gere, can, be, a, commanding, actor, but, not... 286 [gere, commanding, actor, always, great, films... 148 {'gere': 1, 'commanding': 1, 'actor': 1, 'alwa... 0 0.000000
1998 'glory--starring matthew broderick , denzel wa... [starring, matthew, broderick, denzel, washing... 990 [starring, matthew, broderick, denzel, washing... 561 {'starring': 1, 'matthew': 1, 'broderick': 2, ... 0 0.000000
1999 'steven spielberg\'s second epic film on world... [second, epic, film, on, world, war, ii, is, a... 538 [second, epic, film, world, war, ii, unquestio... 287 {'second': 1, 'epic': 2, 'film': 14, 'world': ... 0 0.000000
2000 'truman ( \" true-man \" ) burbank is the perf... [burbank, is, the, perfect, name, for, jim, ch... 901 [burbank, perfect, name, jim, character, film,... 483 {'burbank': 4, 'perfect': 4, 'name': 1, 'jim':... 0 0.000000

2000 rows × 8 columns

In [58]:
print((all_df['bad']!=0).sum())
760
In [59]:
import math
(math.log10(2000/760))
Out[59]:
0.4202164033831899
In [60]:
all_df['removed'] = all_df['tokenized_count'] - all_df['no_stopwords_count']
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [61]:
all_df['removed'].sum()
Out[61]:
543035
In [62]:
all_df['tokenized_count'].sum()
Out[62]:
1189601
In [63]:
all_df['removed'].sum()/all_df['tokenized_count'].sum()
Out[63]:
0.4564849895048844