import os
import csv
import pandas as pd
reviews = pd.read_csv("WK7/kaggle-sentiment/train.tsv", delimiter = "\t")
reviews.head()
label_count = pd.DataFrame(reviews.Sentiment.value_counts())
label_count.columns = ["count"]
label_count["sentiment"] = label_count.index
label_count.reset_index(drop = True, inplace = True)
label_count
#Packages needed for my graphs
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
with sns.plotting_context("talk"):
    sns.barplot(y = "count", x = "sentiment", data = label_count,
                palette = "GnBu_d")
    plt.title("Count of Sentiment by Label")
    plt.xlabel("Sentiment")
    plt.ylabel("Count")
sentence_phrase_count = pd.DataFrame(reviews.groupby("SentenceId")["Phrase"].count())
sentence_phrase_count.head()
count_phrase_counts = pd.DataFrame(sentence_phrase_count.Phrase.value_counts())
count_phrase_counts["phrases_per_review"] = count_phrase_counts.index
count_phrase_counts.columns = ["count", "phrases_per_review"]
count_phrase_counts.reset_index(drop = True, inplace = True)
count_phrase_counts.head()
tick_mark_labels = []
for num in range(1, 64):
    if num % 5 == 0:
        tick_mark_labels.append(num)
print(tick_mark_labels)
tick_marks = []
for num in tick_mark_labels:
    tick_marks.append(num - 1)
print(tick_marks)
with sns.plotting_context("talk"):
    sns.barplot(y = "count", x = "phrases_per_review",
                data = count_phrase_counts, palette = "GnBu_d")
    plt.title("Phrases per Review")
    plt.xlabel("Phrases per Review")
    plt.ylabel("Count")
    plt.xticks(tick_marks, tick_mark_labels, rotation = 0)
from collections import Counter
def make_dic(df, column):
    # Join every row of the column into one string, split into tokens, and count them
    tokens = " ".join(df[column].tolist()).split()
    return Counter(tokens)
dic = make_dic(reviews, "Phrase")
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import numpy as np
def word_cloud(mask_path, dic, title):
    with sns.plotting_context("talk"):
        mask = np.array(Image.open(mask_path))
        word_cloud = WordCloud(background_color = "white", max_words = 750,
                               mask = mask, max_font_size = 125)
        word_cloud.generate_from_frequencies(dic)
        image_colors = ImageColorGenerator(mask)
        plt.figure(figsize = [8, 8])
        plt.imshow(word_cloud.recolor(color_func = image_colors),
                   interpolation = "bilinear")
        plt.title(title)
        plt.axis("off")
def all_words_df(dic):
    df = pd.DataFrame.from_dict(dic, orient = "index")
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    return df
def top_words_df(df, num_of_words):
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    df.reset_index(drop = True, inplace = True)
    return df[:num_of_words]
def top_words_barplot(df, title):
    with sns.plotting_context("talk"):
        sns.barplot(y = "count", x = "word", data = df, palette = "GnBu_d")
        plt.ylabel("Count")
        plt.xlabel("Word")
        plt.xticks(rotation = 90)
        plt.title(title)
def unique_total_words(type_of_review, df):
    print("The total number of words in the", type_of_review, "reviews is", sum(df["count"]))
    print("The total number of unique words in the", type_of_review, "reviews is", len(df))
word_cloud("tomatoes.png", dic, "Rotten Tomatoes Reviews")
words_df = all_words_df(dic)
words_df.head()
top_20 = top_words_df(words_df, 20)
top_20
top_words_barplot(top_20, "Top 20 Words")
unique_total_words("all reviews uncleaned", words_df)
def sentiment_subset(df, sentiment_label):
    # Chaining reset_index avoids chained-assignment warnings on the slice
    df = df[df["Sentiment"] == sentiment_label].reset_index(drop = True)
    print(df.shape)
    return df
reviews_0 = sentiment_subset(reviews, 0)
reviews_0.head()
reviews_1 = sentiment_subset(reviews, 1)
reviews_1.head()
reviews_2 = sentiment_subset(reviews, 2)
reviews_2.head()
reviews_3 = sentiment_subset(reviews, 3)
reviews_3.head()
reviews_4 = sentiment_subset(reviews, 4)
reviews_4.head()
dic_0 = make_dic(reviews_0, "Phrase")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Uncleaned")
words_0 = all_words_df(dic_0)
words_0.head()
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reviews")
unique_total_words("uncleaned bad", words_0)
dic_1 = make_dic(reviews_1, "Phrase")
# word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Uncleaned")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reviews")
unique_total_words("somewhat bad uncleaned", words_1)
dic_2 = make_dic(reviews_2, "Phrase")
# word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Uncleaned")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reviews")
unique_total_words("neutral uncleanned", words_2)
dic_3 = make_dic(reviews_3, "Phrase")
word_cloud("good.png", dic_3, "Somewhat Positive: \n Uncleaned")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Somewhat Positive: \n Uncleaned")
unique_total_words("somewhat positive uncleaned", words_3)
dic_4 = make_dic(reviews_4, "Phrase")
word_cloud("best.png", dic_4, "Positive Reviews: \n Uncleaned")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reviews")
unique_total_words("positive uncleaned", words_4)
The cleaning steps (a pipeline sketch follows the list):
1. Make everything lowercase
2. Expand contractions
3. Remove punctuation
4. Tokenize
5. Stem
6. Remove stopwords
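# A minimal sketch of the six steps applied to a single string, assuming
# nltk's "punkt" tokenizer data has been downloaded; the stop_words default
# is a placeholder (the notebook builds its own list further down).
import nltk
from nltk.stem.snowball import SnowballStemmer

def clean_phrase(text, stop_words = ("the", "and", "a")):
    text = text.lower()                                                # 1. lowercase
    text = text.replace("n't", " not")                                 # 2. expand contractions (one rule shown)
    text = "".join(ch for ch in text if ch.isalnum() or ch.isspace())  # 3. remove punctuation
    tokens = nltk.word_tokenize(text)                                  # 4. tokenize
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens]                 # 5. stem
    return [token for token in tokens if token not in stop_words]      # 6. remove stopwords

clean_phrase("The movies weren't entertaining")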
def remove_contractions(df, column):
    # "can't" must be handled before the generic "n't" rule below
    df[column] = df[column].str.replace(r"can't", "cannot", regex = True)
    df[column] = df[column].str.replace(r"n't", " not", regex = True)
    # "'s" is dropped rather than expanded, since possessive vs. "is" is ambiguous
    df[column] = df[column].str.replace(r"'s", "", regex = True)
    df[column] = df[column].str.replace(r"'m", " am", regex = True)
    df[column] = df[column].str.replace(r"'d", " would", regex = True)
    df[column] = df[column].str.replace(r"'ll", " will", regex = True)
    df[column] = df[column].str.replace(r"'ve", " have", regex = True)
    df[column] = df[column].str.replace(r"'re", " are", regex = True)
    return df[column]
reviews["phrase"] = reviews["Phrase"].str.lower()
reviews.phrase.head()
reviews["phrase"] = remove_contractions(reviews, "phrase")
reviews.tail()
reviews["phrase"] = reviews["phrase"].str.replace(r"[^\w^\s]", "")
reviews["phrase"] = reviews["phrase"].str.replace(r"[0-9]+", "")
import nltk
reviews["phrase"] = reviews.apply(lambda row: nltk.word_tokenize(row["phrase"]), axis = 1)
reviews.head()
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
reviews["stemmed"] = reviews.apply(lambda row: [stemmer.stem(token) for token in row["phrase"]], axis = 1)
reviews.head()
import re
# What the function does: it removes all words with fewer than 3 characters
# Input: the string to have short words removed
# Output: the string with words of 2 or fewer characters removed
def remove_words_less_than_3_characters(string):
    # [A-Za-z] fixes the classic [A-z] bug ([A-z] also matches [, \, ], ^, _, `)
    words = re.findall(r"[A-Za-z][A-Za-z]+\w", string)
    return " ".join(words)
reviews["phrase_reduced"] = reviews.apply(lambda row: remove_words_less_than_3_characters(row["Phrase"]), axis =1 )
reviews.head()
reviews["phrase_reduced"] = reviews.apply(lambda row: nltk.word_tokenize(row["phrase_reduced"]), axis = 1)
stemmer = SnowballStemmer("english")
reviews["reduced_stemmed"] = reviews.apply(lambda row: [stemmer.stem(token) for token in row["phrase_reduced"]], axis = 1)
reviews.head()
# Generating a list of stop words from the most commonly used English words
stop_words = ["the", "and", "you", "that", "was", "for", "with", "are", "his", "they", "one", "have",
"this", "from", "had", "word", "what", "some", "can", "out", "other", "were", "all", "there",
"when", "use", "your", "how", "said", "each", "she", "which", "their", "time", "will", "way",
"about", "many", "then", "them", "write", "would", "these", "her", "make", "thing", "see", "him",
"two", "has", "look", "more", "day", "could", "come", "did", "number", "sound", "people", "over",
"know", "than", "first", "who", "may", "down", "side", "been", "now", "find"]
reviews["further_reduced"] = reviews["phrase_reduced"].apply(lambda row: [word for word in row if word not in stop_words])
reviews["further_reduced_stemmed"] = reviews["reduced_stemmed"].apply(lambda row: [word for word in row if word not in stop_words])
reviews.head()
def get_df_ready_for_viz(df, column):
    # Turn each token list back into a space-separated string
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
reviews_viz = reviews.copy()
columns = ["phrase_reduced", "reduced_stemmed", "further_reduced", "further_reduced_stemmed"]
for column in columns:
    reviews_viz[column] = get_df_ready_for_viz(reviews_viz, column)
reviews_viz.head()
# dic_all = make_dic(reviews_viz, "phrase")
# word_cloud("tomatoes.png", dic_all, "Rotten Tomatoes: \n Cleaned Reviews")
# words_all = all_words_df(dic_all)
# top_words_all = top_words_df(words_all, 20)
# top_words_all
# top_words_barplot(top_words_all, "Top 20 Words: \n All Cleaned Reviews")
# unique_total_words("all cleaned", words_all)
reviews_0 = sentiment_subset(reviews_viz, 0)
reviews_0_viz = reviews_0.copy()
reviews_0_viz.head()
reviews_1 = sentiment_subset(reviews_viz, 1)
reviews_1_viz = reviews_1.copy()
reviews_1_viz.head()
reviews_2 = sentiment_subset(reviews_viz, 2)
reviews_2_viz = reviews_2.copy()
reviews_2_viz.head()
reviews_3 = sentiment_subset(reviews_viz, 3)
reviews_3_viz = reviews_3.copy()
reviews_3_viz.head()
reviews_4 = sentiment_subset(reviews_viz, 4)
reviews_4_viz = reviews_4.copy()
reviews_4_viz.head()
dic_0 = make_dic(reviews_0_viz, "phrase_reduced")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Reduced")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reduced")
unique_total_words("bad reduced", words_0)
dic_1 = make_dic(reviews_1_viz, "phrase_reduced")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Reduced")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reduced")
unique_total_words("somewhat bad reduced", words_1)
dic_2 = make_dic(reviews_2_viz, "phrase_reduced")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Reduced")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reduced")
unique_total_words("neutral reduced", words_2)
dic_3 = make_dic(reviews_3_viz, "phrase_reduced")
word_cloud("good.png", dic_3, "Good Reviews: \n Reduced")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Reduced")
unique_total_words("somewhat positive reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "phrase_reduced")
word_cloud("best.png", dic_4, "Positive Reviews: \n Reduced")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reduced")
unique_total_words("positive reduced", words_4)
dic_0 = make_dic(reviews_0_viz, "reduced_stemmed")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Reduced Stemmed")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reduced Stemmed")
unique_total_words("bad reduced stemmed", words_0)
dic_1 = make_dic(reviews_1_viz, "reduced_stemmed")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Reduced Stemmed")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reduced Stemmed")
unique_total_words("somewhat bad reduced stemmed", words_1)
dic_2 = make_dic(reviews_2_viz, "reduced_stemmed")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Reduced Stemmed")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reduced Stemmed")
unique_total_words("neutral reduced stemmed", words_2)
dic_3 = make_dic(reviews_3_viz, "reduced_stemmed")
word_cloud("good.png", dic_3, "Good Reviews: \n Reduced Stemmed")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Reduced Stemmed")
unique_total_words("somewhat positive reduced stemmed", words_3)
dic_4 = make_dic(reviews_4_viz, "reduced_stemmed")
word_cloud("best.png", dic_4, "Positive Reviews: \n Reduced Stemmed")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reduced Stemmed")
unique_total_words("positive reduced stemmed", words_4)
dic_0 = make_dic(reviews_0_viz, "further_reduced")
word_cloud("horrible.png", dic_0, "Bad Reviews: \n Further Reduced")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Further Reduced")
unique_total_words("bad further reduced", words_0)
dic_1 = make_dic(reviews_1_viz, "further_reduced")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Further Reduced")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Further Reduced")
unique_total_words("somewhat bad further reduced", words_1)
dic_2 = make_dic(reviews_2_viz, "further_reduced")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Further Reduced")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Further Reduced")
unique_total_words("neutral futher reduced", words_2)
dic_3 = make_dic(reviews_3_viz, "further_reduced")
word_cloud("good.png", dic_3, "Good Reviews: \n Further Reduced")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Further Reduced")
unique_total_words("somewhat positive further reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "further_reduced")
word_cloud("best.png", dic_4, "Positive Reviews: \n Further Reduced")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Further Reduced")
unique_total_words("positive further reduced", words_4)
dic_0 = make_dic(reviews_0_viz, "further_reduced_stemmed")
word_cloud("horrible.png", dic_0, "Bad Reviews: \n Further Reduced Stemmed")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Further Reduced Stemmed")
unique_total_words("bad further reduced stemmed", words_0)
dic_1 = make_dic(reviews_1_viz, "further_reduced_stemmed")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Further Reduced Stemmed")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Further Reduced Stemmed")
unique_total_words("somewhat bad further reduced stemmed", words_1)
dic_2 = make_dic(reviews_2_viz, "further_reduced_stemmed")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Further Reduced Stemmed")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Further Reduced Stemmed")
unique_total_words("neutral further reduced stemmed", words_2)
dic_3 = make_dic(reviews_3_viz, "further_reduced_stemmed")
word_cloud("good.png", dic_3, "Good Reviews: \n Further Reduced Stemmed")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Further Reduced Stemmed")
unique_total_words("somewhat positive reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "further_reduced_stemmed")
word_cloud("best.png", dic_4, "Positive Reviews: \n Further Reduced Stemmed")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Further Reduced Stemmed")
unique_total_words("positive further reduced stemmed", words_4)
To make the training data contain an equal number of rows for each sentiment, each class can be downsampled with the sample function using replace = False (a sketch follows).
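# A minimal balancing sketch, assuming every class is downsampled to the
# size of the smallest one; the random_state value is an arbitrary assumption.
min_count = reviews["Sentiment"].value_counts().min()
balanced = (reviews.groupby("Sentiment", group_keys = False)
            .apply(lambda g: g.sample(n = min_count, replace = False, random_state = 12))
            .reset_index(drop = True))
print(balanced["Sentiment"].value_counts())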
def make_df(column):
    df = pd.DataFrame()
    df["review"] = reviews[column]
    df["sentiment"] = reviews["Sentiment"]
    df.reset_index(drop = True, inplace = True)
    return df
reviews.head()
phrase_reduced_df = make_df("phrase_reduced")
phrase_reduced_df.head()
reduced_stemmed_df = make_df("reduced_stemmed")
reduced_stemmed_df.head()
further_reduced_df = make_df("further_reduced")
further_reduced_df.head()
further_reduced_stemmed_df = make_df("further_reduced_stemmed")
further_reduced_stemmed_df.head()
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# What the function does: joins the tokens from the df into one string, then replaces each "," with a space
# Input: the df and column to be changed
# Output: the data untokenized
def getting_data_ready_for_freq(df, column):
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
phrase_reduced_df["review"] = getting_data_ready_for_freq(phrase_reduced_df, "review")
reduced_stemmed_df["review"] = getting_data_ready_for_freq(reduced_stemmed_df, "review")
further_reduced_df["review"] = getting_data_ready_for_freq(further_reduced_df, "review")
further_reduced_stemmed_df["review"] = getting_data_ready_for_freq(further_reduced_stemmed_df, "review")
uni_vec = CountVectorizer(encoding='latin-1', binary=False, min_df=3)
bi_vec = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=3)
uni_tf_vec = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=3)
bigram_tf_vec = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=3)
# Note: the four vectorizers are shared, so each fit_transform below refits
# them; after these cells they hold the vocabulary of the last corpus fit (model 4)
model_1_uni_vec = uni_vec.fit_transform(phrase_reduced_df["review"])
model_1_bi_vec = bi_vec.fit_transform(phrase_reduced_df["review"])
model_1_uni_tf_vec = uni_tf_vec.fit_transform(phrase_reduced_df["review"])
model_1_bigram_tf_vec = bigram_tf_vec.fit_transform(phrase_reduced_df["review"])
model_2_uni_vec = uni_vec.fit_transform(reduced_stemmed_df["review"])
model_2_bi_vec = bi_vec.fit_transform(reduced_stemmed_df["review"])
model_2_uni_tf_vec = uni_tf_vec.fit_transform(reduced_stemmed_df["review"])
model_2_bigram_tf_vec = bigram_tf_vec.fit_transform(reduced_stemmed_df["review"])
model_3_uni_vec = uni_vec.fit_transform(further_reduced_df["review"])
model_3_bi_vec = bi_vec.fit_transform(further_reduced_df["review"])
model_3_uni_tf_vec = uni_tf_vec.fit_transform(further_reduced_df["review"])
model_3_bigram_tf_vec = bigram_tf_vec.fit_transform(further_reduced_df["review"])
model_4_uni_vec = uni_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_bi_vec = bi_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_uni_tf_vec = uni_tf_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_bigram_tf_vec = bigram_tf_vec.fit_transform(further_reduced_stemmed_df["review"])
# Creating testing and training df and labels
model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_train, label_model_1_uni_vec_test = train_test_split(model_1_uni_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_train, label_model_1_bi_vec_test = train_test_split(model_1_bi_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_test = train_test_split(model_1_uni_tf_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_test = train_test_split(model_1_bigram_tf_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_train, label_model_2_uni_vec_test = train_test_split(model_2_uni_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_train, label_model_2_bi_vec_test = train_test_split(model_2_bi_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_test = train_test_split(model_2_uni_tf_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_test = train_test_split(model_2_bigram_tf_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_train, label_model_3_uni_vec_test = train_test_split(model_3_uni_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_train, label_model_3_bi_vec_test = train_test_split(model_3_bi_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_test = train_test_split(model_3_uni_tf_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_test = train_test_split(model_3_bigram_tf_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_train, label_model_4_uni_vec_test = train_test_split(model_4_uni_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_train, label_model_4_bi_vec_test = train_test_split(model_4_bi_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_test = train_test_split(model_4_uni_tf_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_test = train_test_split(model_4_bigram_tf_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
from sklearn.metrics import accuracy_score
all_stats = []
def running_model(model, clf, train_df, train_label, test_df, test_label):
    clf.fit(train_df, train_label)
    predicted = clf.predict(test_df)
    accuracy = accuracy_score(test_label, predicted, normalize = True)
    all_stats.append([model, clf, accuracy])
    results = pd.DataFrame(all_stats, columns = ["model", "classifier", "accuracy"])
    print("The accuracy is", accuracy)
    print("#----------------------------------------------------------------#")
    cm = confusion_matrix(test_label, predicted)
    print(cm)
    print("#----------------------------------------------------------------#")
    print(classification_report(test_label, predicted, target_names = ["0", "1", "2", "3", "4"]))
    return clf, results
print("Model 1 Unigram Vec")
clf, results = running_model("Model 1 Unigram Vec", MultinomialNB(), model_1_uni_vec_train, label_model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_test)
print(clf)
def return_features(vec, model):
    # For MultinomialNB, coef_ mirrors feature_log_prob_; newer scikit-learn
    # drops coef_ on naive Bayes and renames get_feature_names to
    # get_feature_names_out, so adjust the calls there
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis = 1)
        print(df3)
# Refit uni_vec on the model-1 corpus first: the shared vectorizer was last
# fit on further_reduced_stemmed_df, so its vocabulary no longer matches clf
uni_vec.fit(phrase_reduced_df["review"])
return_features(uni_vec, clf)
print("Model 1 Bigram Vec")
results = running_model("Model 1 Bigram Vec", MultinomialNB(), model_1_bi_vec_train, label_model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_test)
results
print("Model 1 Unigram TFIDF Vec")
results = running_model("Model 1 Unigram TFIDF Vec", MultinomialNB(), model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_test)
print("Model 1 Bigram TFIDF Vec")
running_model("Model 1 Bigram TFIDF Vec", MultinomialNB(), model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_test)
print("Model 2 Unigram Vec")
running_model("Model 2 Unigram Vec", MultinomialNB(), model_2_uni_vec_train, label_model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_test)
print("Model 2 Bigram Vec")
running_model("Model 2 Bigram Vec", MultinomialNB(), model_2_bi_vec_train, label_model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_test)
print("Model 2 Unigram TFIDF Vec")
running_model("Model 2 Unigram TFIDF Vec", MultinomialNB(), model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_test)
print("Model 2 Bigram TFIDF Vec")
running_model("Model 2 Bigram TFIDF Vec", MultinomialNB(), model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_test)
print("Model 3 Unigram Vec")
running_model("Model 3 Unigram Vec", MultinomialNB(), model_3_uni_vec_train, label_model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_test)
print("Model 3 Bigram Vec")
running_model("Model 3 Bigram Vec", MultinomialNB(), model_3_bi_vec_train, label_model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_test)
print("Model 3 Unigram TFIDF Vec")
running_model("Model 3 Unigram TFIDF Vec", MultinomialNB(), model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_test)
print("Model 3 Bigram TFIDF Vec")
running_model("Model 3 Bigram TFIDF Vec", MultinomialNB(), model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_test)
print("Model 4 Unigram Vec")
running_model("Model 4 Unigram Vec", MultinomialNB(), model_4_uni_vec_train, label_model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_test)
print("Model 4 Bigram Vec")
running_model("Model 4 Bigram Vec", MultinomialNB(), model_4_bi_vec_train, label_model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_test)
print("Model 4 Unigram TFIDF Vec")
running_model("Model 4 Unigram TFIDF Vec", MultinomialNB(), model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_test)
print("Model 4 Bigram TFIDF Vec")
running_model("Model 4 Bigram TFIDF Vec", MultinomialNB(), model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_test)
from sklearn.svm import LinearSVC
print("Model 1 Unigram Vec")
running_model("Model 1 Unigram Vec", LinearSVC(C=1), model_1_uni_vec_train, label_model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_test)
print("Model 1 Bigram Vec")
running_model("Model 1 Bigram Vec", LinearSVC(C=1), model_1_bi_vec_train, label_model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_test)
print("Model 1 Unigram TFIDF Vec")
running_model("Model 1 Unigram TFIDF Vec", LinearSVC(C=1), model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_test)
print("Model 1 Bigram TFIDF Vec")
running_model("Model 1 Bigram TFIDF Vec", LinearSVC(C=1), model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_test)
print("Model 2 Unigram Vec")
running_model("Model 2 Unigram Vec", LinearSVC(C=1), model_2_uni_vec_train, label_model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_test)
print("Model 2 Bigram Vec")
running_model("Model 2 Bigram Vec", LinearSVC(C=1), model_2_bi_vec_train, label_model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_test)
print("Model 2 Unigram TFIDF Vec")
running_model("Model 2 Unigram TFIDF Vec", LinearSVC(C=1), model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_test)
print("Model 2 Bigram TFIDF Vec")
running_model("Model 2 Bigram TFIDF Vec", LinearSVC(C=1), model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_test)
print("Model 3 Unigram Vec")
running_model("Model 3 Unigram Vec", LinearSVC(C=1), model_3_uni_vec_train, label_model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_test)
print("Model 3 Bigram Vec")
running_model("Model 3 Bigram Vec", LinearSVC(C=1), model_3_bi_vec_train, label_model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_test)
print("Model 3 Unigram TFIDF Vec")
running_model("Model 3 Unigram TFIDF Vec", LinearSVC(C=1), model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_test)
print("Model 3 Bigram TFIDF Vec")
running_model("Model 3 Bigram TFIDF Vec", LinearSVC(C=1), model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_test)
print("Model 4 Unigram Vec")
running_model("Model 4 Unigram Vec", LinearSVC(C=1), model_4_uni_vec_train, label_model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_test)
print("Model 4 Bigram Vec")
running_model("Model 4 Bigram Vec", LinearSVC(C=1), model_4_bi_vec_train, label_model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_test)
print("Model 4 Unigram TFIDF Vec")
running_model("Model 4 Unigram TFIDF Vec", LinearSVC(C=1), model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_test)
print("Model 4 Bigram TFIDF Vec")
running_model("Model 4 Bigram TFIDF Vec", LinearSVC(C=1), model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_test)
from tabulate import tabulate
def return_features(vec, model):
    # Same helper as above, rendered with tabulate; for LinearSVC, coef_
    # holds one weight vector per sentiment class
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis = 1)
        print(tabulate(df3, tablefmt = "fancy_grid", headers = ["Most", "Likely", "Least", "Likely"], floatfmt = ".2f"))
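# Hypothetical usage (an assumption, not from the original notebook): keep a
# LinearSVC from one of the runs above, refit the matching vectorizer on that
# model's corpus, then inspect its strongest features.
# svc_clf, _ = running_model("Model 1 Unigram Vec", LinearSVC(C=1),
#                            model_1_uni_vec_train, label_model_1_uni_vec_train,
#                            model_1_uni_vec_test, label_model_1_uni_vec_test)
# uni_vec.fit(phrase_reduced_df["review"])
# return_features(uni_vec, svc_clf)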