# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.probability import FreqDist
# file = open('WK2/moviereview_arff.arff')
# tokens = []
# for line in file:
# #     print(type(line))
# #     tokens.append(word_tokenize(line))
# len(tokens)

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
# Read the raw file line by line: every physical line becomes one row in
# column 0. The CSV is not parsed here, so the header line ends up as row 0.
# The context manager guarantees the handle is closed once the lines are read.
with open('WK2/moviereview.csv') as file:
    all_df = pd.DataFrame(list(file))

# freq_dist_sent = []
# for sent in tokenized_sentences[1:2]:
#     fdist = FreqDist(sent)
#     print(len(sent))
#     print(fdist['bad'])
# #     print(fdist.items())
# #     print(sent)

from nltk.tokenize import word_tokenize
def get_tokens(sentence):
    """Tokenize *sentence* and return its purely alphabetic tokens, lowercased.

    Tokens for which ``str.isalpha()`` is false (punctuation, numbers, tokens
    containing apostrophes or hyphens) are dropped.
    """
    alphabetic = filter(str.isalpha, word_tokenize(sentence))
    return [token.lower() for token in alphabetic]

# Tokenize the raw text held in column 0 and record how many tokens each row
# produced. Series.apply works on the column directly, so no per-row Series
# has to be built as with DataFrame.apply(axis=1).
all_df['tokenized'] = all_df[0].apply(get_tokens)
all_df['tokenized_count'] = all_df['tokenized'].apply(len)

all_df

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence, stopword_set=None):
    """Return the tokens of *sentence* that are not stop words.

    Args:
        sentence: Iterable of token strings (already tokenized).
        stopword_set: Optional container of words to drop. When omitted,
            the module-level ``stop_words`` set is used.

    Returns:
        A new list holding the surviving tokens in their original order.
        Matching is exact (case-sensitive) membership in the stop-word
        container.
    """
    if stopword_set is None:
        stopword_set = stop_words
    return [word for word in sentence if word not in stopword_set]
# Filter the stop words out of every token list and record the new length.
# Column-wise Series.apply avoids building a Series per row, which
# DataFrame.apply(axis=1) does.
all_df['no_stopwords'] = all_df['tokenized'].apply(remove_stopwords)
all_df['no_stopwords_count'] = all_df['no_stopwords'].apply(len)

all_df

from nltk.probability import FreqDist
def get_most_common(tokens):
    """Build and return an nltk ``FreqDist`` counting each token in *tokens*.

    Despite the name, the whole distribution is returned, not only its most
    frequent entries.
    """
    return FreqDist(tokens)
# One frequency distribution per row, built from the stop-word-free tokens.
all_df['fdist'] = all_df['no_stopwords'].apply(get_most_common)

# Drop row 0 (the CSV header line that was read in as data). Taking an
# explicit copy makes all_df an independent frame, so later column
# assignments do not trigger SettingWithCopyWarning.
all_df = all_df.iloc[1:].copy()
# all_df

# In the 2000 docs, how many times was "bad" used
# inverse of the normalized value
def get_bad(fdist):
    """Return the count stored under the key ``'bad'`` in *fdist*."""
    bad_count = fdist['bad']
    return bad_count

# import math
# (math.log10(2000/760))
# print((all_df['bad']!=0).sum())

def get_tfidf(fdist, n_docs=2000, n_docs_with_term=760):
    """Return the TF-IDF weight of the word ``'bad'`` for one document.

    The weight is the raw count of ``'bad'`` in *fdist* multiplied by
    ``log10(n_docs / n_docs_with_term)``.

    Args:
        fdist: Mapping from token to count for a single document.
        n_docs: Total number of documents in the corpus. Defaults to 2000.
        n_docs_with_term: Number of documents that contain ``'bad'`` at
            least once. Defaults to 760.

    Returns:
        The TF-IDF weight as a float (zero when the count of ``'bad'`` is 0).

    Raises:
        ValueError: If either document count is not positive.
    """
    # Imported locally so the function works even when ``math`` has not been
    # imported at module level before this definition is used.
    import math

    if n_docs <= 0 or n_docs_with_term <= 0:
        raise ValueError("n_docs and n_docs_with_term must be positive")
    idf = math.log10(n_docs / n_docs_with_term)
    return fdist['bad'] * idf

# Per-document raw count of "bad" and its TF-IDF weight, both derived from
# the frequency distribution stored in the 'fdist' column. Series.apply
# passes each distribution straight to the helper, with no per-row Series.
all_df['bad'] = all_df['fdist'].apply(get_bad)
all_df['tfidf_bad'] = all_df['fdist'].apply(get_tfidf)

/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app

all_df

# Count the rows whose 'bad' count is non-zero.
print((all_df['bad']!=0).sum())

760

import math
# log10(2000 / 760): the factor get_tfidf multiplies the 'bad' count by.
# 760 is the number of rows with a non-zero 'bad' count printed earlier.
(math.log10(2000/760))

0.4202164033831899

# Per-row difference between the token count before and after stop-word removal.
all_df['removed'] = all_df['tokenized_count'] - all_df['no_stopwords_count']

/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.

all_df['removed'].sum()

543035

all_df['tokenized_count'].sum()

1189601

# Share of all tokens that stop-word removal discarded (removed / total).
all_df['removed'].sum()/all_df['tokenized_count'].sum()

0.4564849895048844

	0	tokenized	tokenized_count
0	text,reviewclass\n	[text, reviewclass]	2
1	'plot : two teen couples go to a church party ...	[two, teen, couples, go, to, a, church, party,...	638
2	'the happy bastard\'s quick movie review \ndam...	[happy, quick, movie, review, that, bug, got, ...	215
3	'it is movies like these that make a jaded mov...	[is, movies, like, these, that, make, a, jaded...	444
4	' \" quest for camelot \" is warner bros . \' ...	[quest, for, camelot, is, warner, bros, first,...	410
...	...	...	...
1996	'wow ! what a movie . \nit\'s everything a mov...	[what, a, movie, everything, a, movie, can, be...	702
1997	'richard gere can be a commanding actor , but ...	[gere, can, be, a, commanding, actor, but, not...	286
1998	'glory--starring matthew broderick , denzel wa...	[starring, matthew, broderick, denzel, washing...	990
1999	'steven spielberg\'s second epic film on world...	[second, epic, film, on, world, war, ii, is, a...	538
2000	'truman ( \" true-man \" ) burbank is the perf...	[burbank, is, the, perfect, name, for, jim, ch...	901

	0	tokenized	tokenized_count	no_stopwords	no_stopwords_count
0	text,reviewclass\n	[text, reviewclass]	2	[text, reviewclass]	2
1	'plot : two teen couples go to a church party ...	[two, teen, couples, go, to, a, church, party,...	638	[two, teen, couples, go, church, party, drink,...	306
2	'the happy bastard\'s quick movie review \ndam...	[happy, quick, movie, review, that, bug, got, ...	215	[happy, quick, movie, review, bug, got, head, ...	119
3	'it is movies like these that make a jaded mov...	[is, movies, like, these, that, make, a, jaded...	444	[movies, like, make, jaded, movie, viewer, tha...	246
4	' \" quest for camelot \" is warner bros . \' ...	[quest, for, camelot, is, warner, bros, first,...	410	[quest, camelot, warner, bros, first, attempt,...	234
...	...	...	...	...	...
1996	'wow ! what a movie . \nit\'s everything a mov...	[what, a, movie, everything, a, movie, can, be...	702	[movie, everything, movie, funny, dramatic, in...	355
1997	'richard gere can be a commanding actor , but ...	[gere, can, be, a, commanding, actor, but, not...	286	[gere, commanding, actor, always, great, films...	148
1998	'glory--starring matthew broderick , denzel wa...	[starring, matthew, broderick, denzel, washing...	990	[starring, matthew, broderick, denzel, washing...	561
1999	'steven spielberg\'s second epic film on world...	[second, epic, film, on, world, war, ii, is, a...	538	[second, epic, film, world, war, ii, unquestio...	287
2000	'truman ( \" true-man \" ) burbank is the perf...	[burbank, is, the, perfect, name, for, jim, ch...	901	[burbank, perfect, name, jim, character, film,...	483

	0	tokenized	tokenized_count	no_stopwords	no_stopwords_count	fdist	bad	tfidf_bad
1	'plot : two teen couples go to a church party ...	[two, teen, couples, go, to, a, church, party,...	638	[two, teen, couples, go, church, party, drink,...	306	{'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '...	2	0.840433
2	'the happy bastard\'s quick movie review \ndam...	[happy, quick, movie, review, that, bug, got, ...	215	[happy, quick, movie, review, bug, got, head, ...	119	{'happy': 1, 'quick': 1, 'movie': 5, 'review':...	0	0.000000
3	'it is movies like these that make a jaded mov...	[is, movies, like, these, that, make, a, jaded...	444	[movies, like, make, jaded, movie, viewer, tha...	246	{'movies': 1, 'like': 4, 'make': 2, 'jaded': 1...	0	0.000000
4	' \" quest for camelot \" is warner bros . \' ...	[quest, for, camelot, is, warner, bros, first,...	410	[quest, camelot, warner, bros, first, attempt,...	234	{'quest': 5, 'camelot': 4, 'warner': 1, 'bros'...	0	0.000000
5	'synopsis : a mentally unstable man undergoing...	[a, mentally, unstable, man, undergoing, psych...	658	[mentally, unstable, man, undergoing, psychoth...	346	{'mentally': 1, 'unstable': 1, 'man': 2, 'unde...	2	0.840433
...	...	...	...	...	...	...	...	...
1996	'wow ! what a movie . \nit\'s everything a mov...	[what, a, movie, everything, a, movie, can, be...	702	[movie, everything, movie, funny, dramatic, in...	355	{'movie': 14, 'everything': 2, 'funny': 5, 'dr...	0	0.000000
1997	'richard gere can be a commanding actor , but ...	[gere, can, be, a, commanding, actor, but, not...	286	[gere, commanding, actor, always, great, films...	148	{'gere': 1, 'commanding': 1, 'actor': 1, 'alwa...	0	0.000000
1998	'glory--starring matthew broderick , denzel wa...	[starring, matthew, broderick, denzel, washing...	990	[starring, matthew, broderick, denzel, washing...	561	{'starring': 1, 'matthew': 1, 'broderick': 2, ...	0	0.000000
1999	'steven spielberg\'s second epic film on world...	[second, epic, film, on, world, war, ii, is, a...	538	[second, epic, film, world, war, ii, unquestio...	287	{'second': 1, 'epic': 2, 'film': 14, 'world': ...	0	0.000000
2000	'truman ( \" true-man \" ) burbank is the perf...	[burbank, is, the, perfect, name, for, jim, ch...	901	[burbank, perfect, name, jim, character, film,...	483	{'burbank': 4, 'perfect': 4, 'name': 1, 'jim':...	0	0.000000