import os
import csv
import pandas as pd
reviews = pd.read_csv("WK7/kaggle-sentiment/train.tsv", delimiter = "\t")
reviews.head()
label_count = pd.DataFrame(reviews.Sentiment.value_counts())
label_count.columns = ["count"]
label_count["sentiment"] = label_count.index
label_count.reset_index(drop = True, inplace = True)
label_count
#Packages needed for my graphs
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
with sns.plotting_context("talk"):
    sns.barplot(y = "count", x = "sentiment", data = label_count,
                palette = "GnBu_d")
    plt.title("Count of Sentiment by Label")
    plt.xlabel("Sentiment")
    plt.ylabel("Count")
sentence_phrase_count = pd.DataFrame(reviews.groupby("SentenceId")["Phrase"].count())
sentence_phrase_count.head()
count_phrase_counts = pd.DataFrame(sentence_phrase_count.Phrase.value_counts())
count_phrase_counts["phrases_per_review"] = count_phrase_counts.index
count_phrase_counts.columns = ["count", "phrases_per_review"]
count_phrase_counts.reset_index(drop = True, inplace = True)
count_phrase_counts.head()
tick_mark_labels = []
for num in range(1, 64):
    if num % 5 == 0:
        tick_mark_labels.append(num)
print(tick_mark_labels)
tick_marks = []
for num in tick_mark_labels:
    tick_marks.append(num - 1)
print(tick_marks)
with sns.plotting_context("talk"):
    sns.barplot(y = "count", x = "phrases_per_review",
                data = count_phrase_counts, palette = "GnBu_d")
    plt.title("Phrases per Review")
    plt.xlabel("Phrases per Review")
    plt.ylabel("Count")
    plt.xticks(tick_marks, tick_mark_labels, rotation = 0)
from collections import Counter
def make_dic(df, column):
    # Join every row of the column into one string, split into tokens, and count them
    tokens = " ".join(df[column].tolist()).split()
    return Counter(tokens)
dic = make_dic(reviews, "Phrase")
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import numpy as np
def word_cloud(mask_path, dic, title):
    with sns.plotting_context("talk"):
        mask = np.array(Image.open(mask_path))
        word_cloud = WordCloud(background_color = "white", max_words = 750,
                               mask = mask, max_font_size = 125)
        word_cloud.generate_from_frequencies(dic)
        image_colors = ImageColorGenerator(mask)
        plt.figure(figsize = [8, 8])
        plt.imshow(word_cloud.recolor(color_func = image_colors),
                   interpolation = "bilinear")
        plt.title(title)
        plt.axis("off")
def all_words_df(dic):
    df = pd.DataFrame.from_dict(dic, orient = "index")
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    return df
def top_words_df(df, num_of_words):
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    df.reset_index(drop = True, inplace = True)
    return df[:num_of_words]
def top_words_barplot(df, title):
    with sns.plotting_context("talk"):
        sns.barplot(y = "count", x = "word", data = df, palette = "GnBu_d")
        plt.ylabel("Count")
        plt.xlabel("Word")
        plt.xticks(rotation = 90)
        plt.title(title)
def unique_total_words(type_of_review, df):
    print("The total number of words in the", type_of_review, "reviews is", sum(df["count"]))
    print("The total number of unique words in the", type_of_review, "reviews is", len(df))
word_cloud("tomatoes.png", dic, "Rotten Tomatoes Reviews")
words_df = all_words_df(dic)
words_df.head()
top_20 = top_words_df(words_df, 20)
top_20
top_words_barplot(top_20, "Top 20 Words")
unique_total_words("all reviews uncleaned", words_df)
def sentiment_subset(df, sentiment_label):
    # Chaining reset_index avoids chained-assignment warnings on the slice
    df = df[df["Sentiment"] == sentiment_label].reset_index(drop = True)
    print(df.shape)
    return df
reviews_0 = sentiment_subset(reviews, 0)
reviews_0.head()
reviews_1 = sentiment_subset(reviews, 1)
reviews_1.head()
reviews_2 = sentiment_subset(reviews, 2)
reviews_2.head()
reviews_3 = sentiment_subset(reviews, 3)
reviews_3.head()
reviews_4 = sentiment_subset(reviews, 4)
reviews_4.head()
dic_0 = make_dic(reviews_0, "Phrase")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Uncleaned")
words_0 = all_words_df(dic_0)
words_0.head()
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reviews")
unique_total_words("uncleaned bad", words_0)
dic_1 = make_dic(reviews_1, "Phrase")
# word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Uncleaned")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reviews")
unique_total_words("somewhat bad uncleaned", words_1)
dic_2 = make_dic(reviews_2, "Phrase")
# word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Uncleaned")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reviews")
unique_total_words("neutral uncleanned", words_2)
dic_3 = make_dic(reviews_3, "Phrase")
word_cloud("good.png", dic_3, "Somewhat Positive: \n Uncleaned")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Somewhat Positive: \n Uncleaned")
unique_total_words("somewhat positive uncleaned", words_3)
dic_4 = make_dic(reviews_4, "Phrase")
word_cloud("best.png", dic_4, "Positive Reviews: \n Uncleaned")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reviews")
unique_total_words("positive uncleaned", words_4)
The cleaning steps (a pipeline sketch follows the list):
1. Make everything lowercase
2. Expand contractions
3. Remove punctuation
4. Tokenize
5. Stem
6. Remove stopwords
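# A minimal sketch of the six steps applied to a single string, assuming
# nltk's "punkt" tokenizer data has been downloaded; the stop_words default
# is a placeholder (the notebook builds its own list further down).
import nltk
from nltk.stem.snowball import SnowballStemmer

def clean_phrase(text, stop_words = ("the", "and", "a")):
    text = text.lower()                                                # 1. lowercase
    text = text.replace("n't", " not")                                 # 2. expand contractions (one rule shown)
    text = "".join(ch for ch in text if ch.isalnum() or ch.isspace())  # 3. remove punctuation
    tokens = nltk.word_tokenize(text)                                  # 4. tokenize
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens]                 # 5. stem
    return [token for token in tokens if token not in stop_words]      # 6. remove stopwords

clean_phrase("The movies weren't entertaining")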
def remove_contractions(df, column):
    # "can't" must be handled before the generic "n't" rule below
    df[column] = df[column].str.replace(r"can't", "cannot", regex = True)
    df[column] = df[column].str.replace(r"n't", " not", regex = True)
    # "'s" is dropped rather than expanded, since possessive vs. "is" is ambiguous
    df[column] = df[column].str.replace(r"'s", "", regex = True)
    df[column] = df[column].str.replace(r"'m", " am", regex = True)
    df[column] = df[column].str.replace(r"'d", " would", regex = True)
    df[column] = df[column].str.replace(r"'ll", " will", regex = True)
    df[column] = df[column].str.replace(r"'ve", " have", regex = True)
    df[column] = df[column].str.replace(r"'re", " are", regex = True)
    return df[column]
reviews["phrase"] = reviews["Phrase"].str.lower()
reviews.phrase.head()
reviews["phrase"] = remove_contractions(reviews, "phrase")
reviews.tail()
reviews["phrase"] = reviews["phrase"].str.replace(r"[^\w^\s]", "")
reviews["phrase"] = reviews["phrase"].str.replace(r"[0-9]+", "")
import nltk
reviews["phrase"] = reviews.apply(lambda row: nltk.word_tokenize(row["phrase"]), axis = 1)
reviews.head()
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
reviews["stemmed"] = reviews.apply(lambda row: [stemmer.stem(token) for token in row["phrase"]], axis = 1)
reviews.head()
import re
# What the function does: it removes all words with fewer than 3 characters
# Input: the string to have short words removed
# Output: the string with words of 2 or fewer characters removed
def remove_words_less_than_3_characters(string):
    # [A-Za-z] fixes the classic [A-z] bug ([A-z] also matches [, \, ], ^, _, `)
    words = re.findall(r"[A-Za-z][A-Za-z]+\w", string)
    return " ".join(words)
reviews["phrase_reduced"] = reviews.apply(lambda row: remove_words_less_than_3_characters(row["Phrase"]), axis =1 )
reviews.head()
reviews["phrase_reduced"] = reviews.apply(lambda row: nltk.word_tokenize(row["phrase_reduced"]), axis = 1)
stemmer = SnowballStemmer("english")
reviews["reduced_stemmed"] = reviews.apply(lambda row: [stemmer.stem(token) for token in row["phrase_reduced"]], axis = 1)
reviews.head()
# Generating a list of stop words from the most commonly used English words
stop_words = ["the", "and", "you", "that", "was", "for", "with", "are", "his", "they", "one", "have",
"this", "from", "had", "word", "what", "some", "can", "out", "other", "were", "all", "there",
"when", "use", "your", "how", "said", "each", "she", "which", "their", "time", "will", "way",
"about", "many", "then", "them", "write", "would", "these", "her", "make", "thing", "see", "him",
"two", "has", "look", "more", "day", "could", "come", "did", "number", "sound", "people", "over",
"know", "than", "first", "who", "may", "down", "side", "been", "now", "find"]
reviews["further_reduced"] = reviews["phrase_reduced"].apply(lambda row: [word for word in row if word not in stop_words])
reviews["further_reduced_stemmed"] = reviews["reduced_stemmed"].apply(lambda row: [word for word in row if word not in stop_words])
reviews.head()
def get_df_ready_for_viz(df, column):
    # Turn each token list back into a space-separated string
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
reviews_viz = reviews.copy()
columns = ["phrase_reduced", "reduced_stemmed", "further_reduced", "further_reduced_stemmed"]
for column in columns:
    reviews_viz[column] = get_df_ready_for_viz(reviews_viz, column)
reviews_viz.head()
# dic_all = make_dic(reviews_viz, "phrase")
# word_cloud("tomatoes.png", dic_all, "Rotten Tomatoes: \n Cleaned Reviews")
# words_all = all_words_df(dic_all)
# top_words_all = top_words_df(words_all, 20)
# top_words_all
# top_words_barplot(top_words_all, "Top 20 Words: \n All Cleaned Reviews")
# unique_total_words("all cleaned", words_all)
reviews_0 = sentiment_subset(reviews_viz, 0)
reviews_0_viz = reviews_0.copy()
reviews_0_viz.head()
reviews_1 = sentiment_subset(reviews_viz, 1)
reviews_1_viz = reviews_1.copy()
reviews_1_viz.head()
reviews_2 = sentiment_subset(reviews_viz, 2)
reviews_2_viz = reviews_2.copy()
reviews_2_viz.head()
reviews_3 = sentiment_subset(reviews_viz, 3)
reviews_3_viz = reviews_3.copy()
reviews_3_viz.head()
reviews_4 = sentiment_subset(reviews_viz, 4)
reviews_4_viz = reviews_4.copy()
reviews_4_viz.head()
dic_0 = make_dic(reviews_0_viz, "phrase_reduced")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Reduced")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reduced")
unique_total_words("bad reduced", words_0)
dic_1 = make_dic(reviews_1_viz, "phrase_reduced")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Reduced")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reduced")
unique_total_words("somewhat bad reduced", words_1)
dic_2 = make_dic(reviews_2_viz, "phrase_reduced")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Reduced")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reduced")
unique_total_words("neutral reduced", words_2)
dic_3 = make_dic(reviews_3_viz, "phrase_reduced")
word_cloud("good.png", dic_3, "Good Reviews: \n Reduced")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Reduced")
unique_total_words("somewhat positive reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "phrase_reduced")
word_cloud("best.png", dic_4, "Positive Reviews: \n Reduced")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reduced")
unique_total_words("positive reduced", words_4)
dic_0 = make_dic(reviews_0_viz, "reduced_stemmed")
# word_cloud("horrible.png", dic_0, "Bad Reviews: \n Reduced Stemmed")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Reduced Stemmed")
unique_total_words("bad reduced stemmed", words_0)
dic_1 = make_dic(reviews_1_viz, "reduced_stemmed")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Reduced Stemmed")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Reduced Stemmed")
unique_total_words("somewhat bad reduced stemmed", words_1)
dic_2 = make_dic(reviews_2_viz, "reduced_stemmed")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Reduced Stemmed")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Reduced Stemmed")
unique_total_words("neutral reduced stemmed", words_2)
dic_3 = make_dic(reviews_3_viz, "reduced_stemmed")
word_cloud("good.png", dic_3, "Good Reviews: \n Reduced Stemmed")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Reduced Stemmed")
unique_total_words("somewhat positive reduced stemmed", words_3)
dic_4 = make_dic(reviews_4_viz, "reduced_stemmed")
word_cloud("best.png", dic_4, "Positive Reviews: \n Reduced Stemmed")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Reduced Stemmed")
unique_total_words("positive reduced stemmed", words_4)
dic_0 = make_dic(reviews_0_viz, "further_reduced")
word_cloud("horrible.png", dic_0, "Bad Reviews: \n Further Reduced")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Further Reduced")
unique_total_words("bad further reduced", words_0)
dic_1 = make_dic(reviews_1_viz, "further_reduced")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Further Reduced")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Further Reduced")
unique_total_words("somewhat bad further reduced", words_1)
dic_2 = make_dic(reviews_2_viz, "further_reduced")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Further Reduced")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Further Reduced")
unique_total_words("neutral futher reduced", words_2)
dic_3 = make_dic(reviews_3_viz, "further_reduced")
word_cloud("good.png", dic_3, "Good Reviews: \n Further Reduced")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Further Reduced")
unique_total_words("somewhat positive further reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "further_reduced")
word_cloud("best.png", dic_4, "Positive Reviews: \n Further Reduced")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Further Reduced")
unique_total_words("positive further reduced", words_4)
dic_0 = make_dic(reviews_0_viz, "further_reduced_stemmed")
word_cloud("horrible.png", dic_0, "Bad Reviews: \n Further Reduced Stemmed")
words_0 = all_words_df(dic_0)
top_words_0 = top_words_df(words_0, 20)
top_words_0
top_words_barplot(top_words_0, "Top 20 Words: \n Bad Further Reduced Stemmed")
unique_total_words("bad further reduced stemmed", words_0)
dic_1 = make_dic(reviews_1_viz, "further_reduced_stemmed")
word_cloud("bad.png", dic_1, "Somewhat Bad Reviews: \n Further Reduced Stemmed")
words_1 = all_words_df(dic_1)
top_words_1 = top_words_df(words_1, 20)
top_words_1
top_words_barplot(top_words_1, "Top 20 Words: \n Somewhat Bad Further Reduced Stemmed")
unique_total_words("somewhat bad further reduced stemmed", words_1)
dic_2 = make_dic(reviews_2_viz, "further_reduced_stemmed")
word_cloud("neutral.png", dic_2, "Neutral Reviews: \n Further Reduced Stemmed")
words_2 = all_words_df(dic_2)
top_words_2 = top_words_df(words_2, 20)
top_words_2
top_words_barplot(top_words_2, "Top 20 Words: \n Neutral Further Reduced Stemmed")
unique_total_words("neutral further reduced stemmed", words_2)
dic_3 = make_dic(reviews_3_viz, "further_reduced_stemmed")
word_cloud("good.png", dic_3, "Good Reviews: \n Further Reduced Stemmed")
words_3 = all_words_df(dic_3)
top_words_3 = top_words_df(words_3, 20)
top_words_3
top_words_barplot(top_words_3, "Top 20 Words: \n Somewhat Positive Further Reduced Stemmed")
unique_total_words("somewhat positive reduced", words_3)
dic_4 = make_dic(reviews_4_viz, "further_reduced_stemmed")
word_cloud("best.png", dic_4, "Positive Reviews: \n Further Reduced Stemmed")
words_4 = all_words_df(dic_4)
top_words_4 = top_words_df(words_4, 20)
top_words_4
top_words_barplot(top_words_4, "Top 20 Words: \n Positive Further Reduced Stemmed")
unique_total_words("positive further reduced stemmed", words_4)
To make the training data contain an equal number of rows for each sentiment, each class can be downsampled with the sample function using replace = False (a sketch follows).
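# A minimal balancing sketch, assuming every class is downsampled to the
# size of the smallest one; the random_state value is an arbitrary assumption.
min_count = reviews["Sentiment"].value_counts().min()
balanced = (reviews.groupby("Sentiment", group_keys = False)
            .apply(lambda g: g.sample(n = min_count, replace = False, random_state = 12))
            .reset_index(drop = True))
print(balanced["Sentiment"].value_counts())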
def make_df(column):
    df = pd.DataFrame()
    df["review"] = reviews[column]
    df["sentiment"] = reviews["Sentiment"]
    df.reset_index(drop = True, inplace = True)
    return df
reviews.head()
phrase_reduced_df = make_df("phrase_reduced")
phrase_reduced_df.head()
reduced_stemmed_df = make_df("reduced_stemmed")
reduced_stemmed_df.head()
further_reduced_df = make_df("further_reduced")
further_reduced_df.head()
further_reduced_stemmed_df = make_df("further_reduced_stemmed")
further_reduced_stemmed_df.head()
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# What the function does: joins the tokens from the df into one string, then replaces each "," with a space
# Input: the df and column to be changed
# Output: the data untokenized
def getting_data_ready_for_freq(df, column):
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
phrase_reduced_df["review"] = getting_data_ready_for_freq(phrase_reduced_df, "review")
reduced_stemmed_df["review"] = getting_data_ready_for_freq(reduced_stemmed_df, "review")
further_reduced_df["review"] = getting_data_ready_for_freq(further_reduced_df, "review")
further_reduced_stemmed_df["review"] = getting_data_ready_for_freq(further_reduced_stemmed_df, "review")
uni_vec = CountVectorizer(encoding='latin-1', binary=False, min_df=3)
bi_vec = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=3)
uni_tf_vec = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=3)
bigram_tf_vec = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=3)
# Note: the four vectorizers are shared, so each fit_transform below refits
# them; after these cells they hold the vocabulary of the last corpus fit (model 4)
model_1_uni_vec = uni_vec.fit_transform(phrase_reduced_df["review"])
model_1_bi_vec = bi_vec.fit_transform(phrase_reduced_df["review"])
model_1_uni_tf_vec = uni_tf_vec.fit_transform(phrase_reduced_df["review"])
model_1_bigram_tf_vec = bigram_tf_vec.fit_transform(phrase_reduced_df["review"])
model_2_uni_vec = uni_vec.fit_transform(reduced_stemmed_df["review"])
model_2_bi_vec = bi_vec.fit_transform(reduced_stemmed_df["review"])
model_2_uni_tf_vec = uni_tf_vec.fit_transform(reduced_stemmed_df["review"])
model_2_bigram_tf_vec = bigram_tf_vec.fit_transform(reduced_stemmed_df["review"])
model_3_uni_vec = uni_vec.fit_transform(further_reduced_df["review"])
model_3_bi_vec = bi_vec.fit_transform(further_reduced_df["review"])
model_3_uni_tf_vec = uni_tf_vec.fit_transform(further_reduced_df["review"])
model_3_bigram_tf_vec = bigram_tf_vec.fit_transform(further_reduced_df["review"])
model_4_uni_vec = uni_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_bi_vec = bi_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_uni_tf_vec = uni_tf_vec.fit_transform(further_reduced_stemmed_df["review"])
model_4_bigram_tf_vec = bigram_tf_vec.fit_transform(further_reduced_stemmed_df["review"])
# Creating testing and training df and labels
model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_train, label_model_1_uni_vec_test = train_test_split(model_1_uni_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_train, label_model_1_bi_vec_test = train_test_split(model_1_bi_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_test = train_test_split(model_1_uni_tf_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_test = train_test_split(model_1_bigram_tf_vec, phrase_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_train, label_model_2_uni_vec_test = train_test_split(model_2_uni_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_train, label_model_2_bi_vec_test = train_test_split(model_2_bi_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_test = train_test_split(model_2_uni_tf_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_test = train_test_split(model_2_bigram_tf_vec, reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_train, label_model_3_uni_vec_test = train_test_split(model_3_uni_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_train, label_model_3_bi_vec_test = train_test_split(model_3_bi_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_test = train_test_split(model_3_uni_tf_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_test = train_test_split(model_3_bigram_tf_vec, further_reduced_df["sentiment"], test_size = 0.4, random_state = 12)
# Creating testing and training df and labels
model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_train, label_model_4_uni_vec_test = train_test_split(model_4_uni_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_train, label_model_4_bi_vec_test = train_test_split(model_4_bi_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_test = train_test_split(model_4_uni_tf_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_test = train_test_split(model_4_bigram_tf_vec, further_reduced_stemmed_df["sentiment"], test_size = 0.4, random_state = 12)
from sklearn.metrics import accuracy_score
all_stats = []
def running_model(model, clf, train_df, train_label, test_df, test_label):
    clf.fit(train_df, train_label)
    predicted = clf.predict(test_df)
    accuracy = accuracy_score(test_label, predicted, normalize = True)
    all_stats.append([model, clf, accuracy])
    results = pd.DataFrame(all_stats, columns = ["model", "classifier", "accuracy"])
    print("The accuracy is", accuracy)
    print("#----------------------------------------------------------------#")
    cm = confusion_matrix(test_label, predicted)
    print(cm)
    print("#----------------------------------------------------------------#")
    print(classification_report(test_label, predicted, target_names = ["0", "1", "2", "3", "4"]))
    return clf, results
print("Model 1 Unigram Vec")
clf, results = running_model("Model 1 Unigram Vec", MultinomialNB(), model_1_uni_vec_train, label_model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_test)
print(clf)
def return_features(vec, model):
    # For MultinomialNB, coef_ mirrors feature_log_prob_; newer scikit-learn
    # drops coef_ on naive Bayes and renames get_feature_names to
    # get_feature_names_out, so adjust the calls there
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis = 1)
        print(df3)
# Refit uni_vec on the model-1 corpus first: the shared vectorizer was last
# fit on further_reduced_stemmed_df, so its vocabulary no longer matches clf
uni_vec.fit(phrase_reduced_df["review"])
return_features(uni_vec, clf)
print("Model 1 Bigram Vec")
results = running_model("Model 1 Bigram Vec", MultinomialNB(), model_1_bi_vec_train, label_model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_test)
results
print("Model 1 Unigram TFIDF Vec")
results = running_model("Model 1 Unigram TFIDF Vec", MultinomialNB(), model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_test)
print("Model 1 Bigram TFIDF Vec")
running_model("Model 1 Bigram TFIDF Vec", MultinomialNB(), model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_test)
print("Model 2 Unigram Vec")
running_model("Model 2 Unigram Vec", MultinomialNB(), model_2_uni_vec_train, label_model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_test)
print("Model 2 Bigram Vec")
running_model("Model 2 Bigram Vec", MultinomialNB(), model_2_bi_vec_train, label_model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_test)
print("Model 2 Unigram TFIDF Vec")
running_model("Model 2 Unigram TFIDF Vec", MultinomialNB(), model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_test)
print("Model 2 Bigram TFIDF Vec")
running_model("Model 2 Bigram TFIDF Vec", MultinomialNB(), model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_test)
print("Model 3 Unigram Vec")
running_model("Model 3 Unigram Vec", MultinomialNB(), model_3_uni_vec_train, label_model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_test)
print("Model 3 Bigram Vec")
running_model("Model 3 Bigram Vec", MultinomialNB(), model_3_bi_vec_train, label_model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_test)
print("Model 3 Unigram TFIDF Vec")
running_model("Model 3 Unigram TFIDF Vec", MultinomialNB(), model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_test)
print("Model 3 Bigram TFIDF Vec")
running_model("Model 3 Bigram TFIDF Vec", MultinomialNB(), model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_test)
print("Model 4 Unigram Vec")
running_model("Model 4 Unigram Vec", MultinomialNB(), model_4_uni_vec_train, label_model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_test)
print("Model 4 Bigram Vec")
running_model("Model 4 Bigram Vec", MultinomialNB(), model_4_bi_vec_train, label_model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_test)
print("Model 4 Unigram TFIDF Vec")
running_model("Model 4 Unigram TFIDF Vec", MultinomialNB(), model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_test)
print("Model 4 Bigram TFIDF Vec")
running_model("Model 4 Bigram TFIDF Vec", MultinomialNB(), model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_test)
from sklearn.svm import LinearSVC
print("Model 1 Unigram Vec")
running_model("Model 1 Unigram Vec", LinearSVC(C=1), model_1_uni_vec_train, label_model_1_uni_vec_train, model_1_uni_vec_test, label_model_1_uni_vec_test)
print("Model 1 Bigram Vec")
running_model("Model 1 Bigram Vec", LinearSVC(C=1), model_1_bi_vec_train, label_model_1_bi_vec_train, model_1_bi_vec_test, label_model_1_bi_vec_test)
print("Model 1 Unigram TFIDF Vec")
running_model("Model 1 Unigram TFIDF Vec", LinearSVC(C=1), model_1_uni_tf_vec_train, label_model_1_uni_tf_vec_train, model_1_uni_tf_vec_test, label_model_1_uni_tf_vec_test)
print("Model 1 Bigram TFIDF Vec")
running_model("Model 1 Bigram TFIDF Vec", LinearSVC(C=1), model_1_bigram_tf_vec_train, label_model_1_bigram_tf_vec_train, model_1_bigram_tf_vec_test, label_model_1_bigram_tf_vec_test)
print("Model 2 Unigram Vec")
running_model("Model 2 Unigram Vec", LinearSVC(C=1), model_2_uni_vec_train, label_model_2_uni_vec_train, model_2_uni_vec_test, label_model_2_uni_vec_test)
print("Model 2 Bigram Vec")
running_model("Model 2 Bigram Vec", LinearSVC(C=1), model_2_bi_vec_train, label_model_2_bi_vec_train, model_2_bi_vec_test, label_model_2_bi_vec_test)
print("Model 2 Unigram TFIDF Vec")
running_model("Model 2 Unigram TFIDF Vec", LinearSVC(C=1), model_2_uni_tf_vec_train, label_model_2_uni_tf_vec_train, model_2_uni_tf_vec_test, label_model_2_uni_tf_vec_test)
print("Model 2 Bigram TFIDF Vec")
running_model("Model 2 Bigram TFIDF Vec", LinearSVC(C=1), model_2_bigram_tf_vec_train, label_model_2_bigram_tf_vec_train, model_2_bigram_tf_vec_test, label_model_2_bigram_tf_vec_test)
print("Model 3 Unigram Vec")
running_model("Model 3 Unigram Vec", LinearSVC(C=1), model_3_uni_vec_train, label_model_3_uni_vec_train, model_3_uni_vec_test, label_model_3_uni_vec_test)
print("Model 3 Bigram Vec")
running_model("Model 3 Bigram Vec", LinearSVC(C=1), model_3_bi_vec_train, label_model_3_bi_vec_train, model_3_bi_vec_test, label_model_3_bi_vec_test)
print("Model 3 Unigram TFIDF Vec")
running_model("Model 3 Unigram TFIDF Vec", LinearSVC(C=1), model_3_uni_tf_vec_train, label_model_3_uni_tf_vec_train, model_3_uni_tf_vec_test, label_model_3_uni_tf_vec_test)
print("Model 3 Bigram TFIDF Vec")
running_model("Model 3 Bigram TFIDF Vec", LinearSVC(C=1), model_3_bigram_tf_vec_train, label_model_3_bigram_tf_vec_train, model_3_bigram_tf_vec_test, label_model_3_bigram_tf_vec_test)
print("Model 4 Unigram Vec")
running_model("Model 4 Unigram Vec", LinearSVC(C=1), model_4_uni_vec_train, label_model_4_uni_vec_train, model_4_uni_vec_test, label_model_4_uni_vec_test)
print("Model 4 Bigram Vec")
running_model("Model 4 Bigram Vec", LinearSVC(C=1), model_4_bi_vec_train, label_model_4_bi_vec_train, model_4_bi_vec_test, label_model_4_bi_vec_test)
print("Model 4 Unigram TFIDF Vec")
running_model("Model 4 Unigram TFIDF Vec", LinearSVC(C=1), model_4_uni_tf_vec_train, label_model_4_uni_tf_vec_train, model_4_uni_tf_vec_test, label_model_4_uni_tf_vec_test)
print("Model 4 Bigram TFIDF Vec")
running_model("Model 4 Bigram TFIDF Vec", LinearSVC(C=1), model_4_bigram_tf_vec_train, label_model_4_bigram_tf_vec_train, model_4_bigram_tf_vec_test, label_model_4_bigram_tf_vec_test)
from tabulate import tabulate
def return_features(vec, model):
    # Same helper as above, rendered with tabulate; for LinearSVC, coef_
    # holds one weight vector per sentiment class
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis = 1)
        print(tabulate(df3, tablefmt = "fancy_grid", headers = ["Most", "Likely", "Least", "Likely"], floatfmt = ".2f"))
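# Hypothetical usage (an assumption, not from the original notebook): keep a
# LinearSVC from one of the runs above, refit the matching vectorizer on that
# model's corpus, then inspect its strongest features.
# svc_clf, _ = running_model("Model 1 Unigram Vec", LinearSVC(C=1),
#                            model_1_uni_vec_train, label_model_1_uni_vec_train,
#                            model_1_uni_vec_test, label_model_1_uni_vec_test)
# uni_vec.fit(phrase_reduced_df["review"])
# return_features(uni_vec, svc_clf)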