{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# Load the tab-separated training file; labels are read from the\n",
    "# 'Sentiment' column and the text from the 'Phrase' column.\n",
    "# (Was `p.read_csv`: `p` is never defined, so a fresh kernel raised NameError.)\n",
    "train = pd.read_csv(\"kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y = train['Sentiment'].values\n",
    "X = train['Phrase'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.5529026288030471 | B? F | CV: 2 | Classifier: BernoulliNB\n",
      "0.5531524365695574 | B? F | CV: 3 | Classifier: BernoulliNB\n",
      "0.5592720169584305 | B? F | CV: 2 | Classifier: MultinomialNB\n",
      "0.5595474569680894 | B? F | CV: 3 | Classifier: MultinomialNB\n",
      "0.5596116298457374 | B? T | CV: 2 | Classifier: MultinomialNB\n",
      "0.5601369637205256 | B? T | CV: 3 | Classifier: MultinomialNB\n"
     ]
    }
   ],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "def runPipeline(classifier, boolean, cv):\n",
    "    \"\"\"Cross-validate a CountVectorizer + classifier pipeline and print the mean score.\n",
    "\n",
    "    Reads the notebook-level globals ``X`` and ``y`` defined in the\n",
    "    data-loading cell, so that cell must be run first.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    classifier : estimator instance\n",
    "        Used as the final ``'nb'`` step of the pipeline.\n",
    "    boolean : bool\n",
    "        Forwarded to ``CountVectorizer(binary=...)``.\n",
    "    cv : int\n",
    "        Forwarded to ``cross_val_score(cv=...)``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    None\n",
    "        The result is printed as\n",
    "        ``<mean score> | B? <T or F> | CV: <cv> | Classifier: <class name>``.\n",
    "    \"\"\"\n",
    "    nb_clf_pipe = Pipeline([\n",
    "        ('vect', CountVectorizer(encoding='latin-1', binary=boolean)),\n",
    "        ('nb', classifier),\n",
    "    ])\n",
    "    scores = cross_val_score(nb_clf_pipe, X, y, cv=cv)\n",
    "    avg = sum(scores) / len(scores)\n",
    "    pretty_line = \"{} | B? {} | CV: {} | Classifier: {}\"\n",
    "    # str(boolean)[0] abbreviates True/False to 'T'/'F'; the class name is\n",
    "    # taken from the type rather than parsed out of the estimator's repr.\n",
    "    print(pretty_line.format(avg, str(boolean)[0], cv, type(classifier).__name__))\n",
    "\n",
    "runPipeline(BernoulliNB(), False, 2)\n",
    "runPipeline(BernoulliNB(), False, 3)\n",
    "runPipeline(MultinomialNB(), False, 2)\n",
    "runPipeline(MultinomialNB(), False, 3)\n",
    "runPipeline(MultinomialNB(), True, 2)\n",
    "runPipeline(MultinomialNB(), True, 3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1, 2, 2, ..., 3, 2, 2])"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('hw6_data_sentiment.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | 0 | \n", "PoN | \n", "tokens | \n", "num_tokens | \n", "sentences | \n", "num_sentences | \n", "no_sw | \n", "num_no_sw | \n", "topwords_unfil | \n", "topwords_fil | \n", "... | \n", "v_pos_fd | \n", "bow | \n", "bow_nosw | \n", "diy_cleaner | \n", "pruned | \n", "nltk_negs | \n", "unigram_feats | \n", "bigram_feats | \n", "bigram_feats_neg | \n", "no_shared_words | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "I went to XYZ restaurant last week and I was v... | \n", "N | \n", "['i', 'went', 'to', 'xyz', 'restaurant', 'last... | \n", "50 | \n", "['I went to XYZ restaurant last week and I was... | \n", "3 | \n", "['went', 'xyz', 'restaurant', 'last', 'week', ... | \n", "25 | \n", "[('was', 4), ('to', 3), ('i', 2), ('and', 2), ... | \n", "[('went', 1), ('xyz', 1), ('restaurant', 1), (... | \n", "... | \n", "0.186 | \n", "Counter({'was': 4, 'to': 3, 'i': 2, 'and': 2, ... | \n", "Counter({'went': 1, 'xyz': 1, 'restaurant': 1,... | \n", "i went to xyz restaurant last week and i was v... | \n", "went restaurant last week very disappointed. f... | \n", "['i', 'went', 'to', 'xyz', 'restaurant', 'last... | \n", "['was', 'to', 'i', 'and', 'the', 'a_NEG', 'for... | \n", "['i_went', 'went_to', 'to_xyz', 'xyz_restauran... | \n", "['i_went', 'went_to', 'to_xyz', 'xyz_restauran... | \n", "['i', 'to', 'xyz', 'week', 'and', 'i', 'was', ... | \n", "
1 | \n", "In each of the diner dish there are at least o... | \n", "N | \n", "['in', 'each', 'of', 'the', 'diner', 'dish', '... | \n", "78 | \n", "['In each of the diner dish there are at least... | \n", "4 | \n", "['diner', 'dish', 'least', 'one', 'fly', 'wait... | \n", "31 | \n", "[('the', 6), ('in', 4), ('to', 4), ('of', 3), ... | \n", "[('want', 3), ('dish', 2), ('diner', 1), ('lea... | \n", "... | \n", "0.042 | \n", "Counter({'the': 6, 'in': 4, 'to': 4, 'of': 3, ... | \n", "Counter({'want': 3, 'dish': 2, 'diner': 1, 'le... | \n", "in each of the diner dish there are at least o... | \n", "each diner dish there least waiting hour dish ... | \n", "['in', 'each', 'of', 'the', 'diner', 'dish', '... | \n", "['to_NEG', 'the', 'want_NEG', 'the_NEG', 'in',... | \n", "['in_each', 'each_of', 'of_the', 'the_diner', ... | \n", "['in_each', 'each_of', 'of_the', 'the_diner', ... | \n", "['in', 'of', 'the', 'diner', 'are', 'at', 'lea... | \n", "
2 | \n", "This is the last place you would want to dine ... | \n", "N | \n", "['this', 'is', 'the', 'last', 'place', 'you', ... | \n", "151 | \n", "['This is the last place you would want to din... | \n", "7 | \n", "['last', 'place', 'would', 'want', 'dine', 'pr... | \n", "61 | \n", "[('to', 10), ('the', 9), ('and', 7), ('we', 5)... | \n", "[('minutes', 3), ('place', 2), ('price', 2), (... | \n", "... | \n", "0.171 | \n", "Counter({'to': 10, 'the': 9, 'and': 7, 'we': 5... | \n", "Counter({'minutes': 3, 'place': 2, 'price': 2,... | \n", "this is the last place you would want to dine ... | \n", "this last place would want dine price that exp... | \n", "['this', 'is', 'the', 'last', 'place', 'you', ... | \n", "['to_NEG', 'the_NEG', 'and_NEG', 'we_NEG', 'ha... | \n", "['this_is', 'is_the', 'the_last', 'last_place'... | \n", "['this_is', 'is_the', 'the_last', 'last_place'... | \n", "['is', 'the', 'you', 'to', 'dine', 'at', 'the'... | \n", "
3 | \n", "I went to this restaurant where I had ordered ... | \n", "N | \n", "['i', 'went', 'to', 'this', 'restaurant', 'whe... | \n", "75 | \n", "['I went to this restaurant where I had ordere... | \n", "6 | \n", "['went', 'restaurant', 'ordered', 'complimenta... | \n", "33 | \n", "[('i', 6), ('the', 6), ('to', 3), ('for', 3), ... | \n", "[('salad', 3), ('restaurant', 2), ('waiter', 2... | \n", "... | \n", "0.162 | \n", "Counter({'i': 6, 'the': 6, 'to': 3, 'for': 3, ... | \n", "Counter({'salad': 3, 'restaurant': 2, 'waiter'... | \n", "i went to this restaurant where i had ordered ... | \n", "went this restaurant where ordered complimenta... | \n", "['i', 'went', 'to', 'this', 'restaurant', 'whe... | \n", "['the', 'i', 'salad', 'had', 'for', 'waiter', ... | \n", "['i_went', 'went_to', 'to_this', 'this_restaur... | \n", "['i_went', 'went_to', 'to_this', 'this_restaur... | \n", "['i', 'to', 'i', 'had', 'for', 'the', 'complim... | \n", "
4 | \n", "I went there with two friends at 6pm. Long que... | \n", "N | \n", "['i', 'went', 'there', 'with', 'two', 'friends... | \n", "73 | \n", "['I went there with two friends at 6pm.', 'Lon... | \n", "10 | \n", "['went', 'two', 'friends', 'long', 'queue', 'd... | \n", "38 | \n", "[('there', 3), ('but', 3), ('it', 3), ('a', 3)... | \n", "[('two', 2), ('friends', 2), ('long', 2), ('di... | \n", "... | \n", "0.353 | \n", "Counter({'there': 3, 'but': 3, 'it': 3, 'a': 3... | \n", "Counter({'two': 2, 'friends': 2, 'long': 2, 'd... | \n", "i went there with two friends at 6pm. long que... | \n", "went there with friends 6pm. long queue there.... | \n", "['i', 'went', 'there', 'with', 'two', 'friends... | \n", "['a_NEG', 'there', 'us_NEG', 'but_NEG', 'and_N... | \n", "['i_went', 'went_there', 'there_with', 'with_t... | \n", "['i_went', 'went_there', 'there_with', 'with_t... | \n", "['i', 'two', 'at', 'queue', 'was', 'but', 'it'... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
83 | \n", "This place was one of the best restaurant I ha... | \n", "P | \n", "['this', 'place', 'was', 'one', 'of', 'the', '... | \n", "70 | \n", "['This place was one of the best restaurant I ... | \n", "6 | \n", "['place', 'one', 'best', 'restaurant', 'price'... | \n", "32 | \n", "[('the', 5), ('i', 3), ('and', 3), ('this', 2)... | \n", "[('best', 2), ('area', 2), ('place', 1), ('one... | \n", "... | \n", "0.300 | \n", "Counter({'the': 5, 'i': 3, 'and': 3, 'this': 2... | \n", "Counter({'best': 2, 'area': 2, 'place': 1, 'on... | \n", "this place was one of the best restaurant i ha... | \n", "this place best restaurant have been. price li... | \n", "['this', 'place', 'was', 'one', 'of', 'the', '... | \n", "['the', 'i', 'and', 'this', 'best', 'is', 'are... | \n", "['this_place', 'place_was', 'was_one', 'one_of... | \n", "['this_place', 'place_was', 'was_one', 'one_of... | \n", "['was', 'one', 'of', 'the', 'i', 'the', 'is', ... | \n", "
84 | \n", "The best experience I ever had happened in Lon... | \n", "P | \n", "['the', 'best', 'experience', 'i', 'ever', 'ha... | \n", "42 | \n", "['The best experience I ever had happened in L... | \n", "3 | \n", "['best', 'experience', 'ever', 'happened', 'lo... | \n", "21 | \n", "[('the', 3), ('in', 3), ('food', 2), ('a', 2),... | \n", "[('food', 2), ('best', 1), ('experience', 1), ... | \n", "... | \n", "0.283 | \n", "Counter({'the': 3, 'in': 3, 'food': 2, 'a': 2,... | \n", "Counter({'food': 2, 'best': 1, 'experience': 1... | \n", "the best experience i ever had happened in lon... | \n", "best experience ever happened london britain. ... | \n", "['the', 'best', 'experience', 'i', 'ever', 'ha... | \n", "['in', 'the', 'best', 'experience', 'i', 'ever... | \n", "['the_best', 'best_experience', 'experience_i'... | \n", "['the_best', 'best_experience', 'experience_i'... | \n", "['the', 'i', 'had', 'happened', 'in', 'london'... | \n", "
85 | \n", "This Japanese restaurant is so popular recentl... | \n", "P | \n", "['this', 'japanese', 'restaurant', 'is', 'so',... | \n", "88 | \n", "['This Japanese restaurant is so popular recen... | \n", "12 | \n", "['japanese', 'restaurant', 'popular', 'recentl... | \n", "49 | \n", "[('is', 4), ('the', 4), ('japanese', 2), ('a',... | \n", "[('japanese', 2), ('food', 2), ('right', 2), (... | \n", "... | \n", "0.462 | \n", "Counter({'is': 4, 'the': 4, 'japanese': 2, 'a'... | \n", "Counter({'japanese': 2, 'food': 2, 'right': 2,... | \n", "this japanese restaurant is so popular recentl... | \n", "this japanese restaurant popular recently that... | \n", "['this', 'japanese', 'restaurant', 'is', 'so',... | \n", "['the_NEG', 'is_NEG', 'japanese', 'a', 'and_NE... | \n", "['this_japanese', 'japanese_restaurant', 'rest... | \n", "['this_japanese', 'japanese_restaurant', 'rest... | \n", "['is', 'so', 'popular', 'recently', 'as', 'a',... | \n", "
86 | \n", "Hibachi the grill is one of my favorite restau... | \n", "P | \n", "['hibachi', 'the', 'grill', 'is', 'one', 'of',... | \n", "65 | \n", "['Hibachi the grill is one of my favorite rest... | \n", "5 | \n", "['hibachi', 'grill', 'one', 'favorite', 'resta... | \n", "30 | \n", "[('the', 8), ('is', 6), ('it', 3), ('hibachi',... | \n", "[('hibachi', 2), ('grill', 2), ('restaurants',... | \n", "... | \n", "0.388 | \n", "Counter({'the': 8, 'is': 6, 'it': 3, 'hibachi'... | \n", "Counter({'hibachi': 2, 'grill': 2, 'restaurant... | \n", "hibachi the grill is one of my favorite restau... | \n", "hibachi grill favorite restaurants. like drama... | \n", "['hibachi', 'the', 'grill', 'is', 'one', 'of',... | \n", "['the', 'is', 'it', 'hibachi', 'grill', 'of', ... | \n", "['hibachi_the', 'the_grill', 'grill_is', 'is_o... | \n", "['hibachi_the', 'the_grill', 'grill_is', 'is_o... | \n", "['hibachi', 'the', 'grill', 'is', 'one', 'of',... | \n", "
87 | \n", "I went to this ultra-luxurious restaurant in D... | \n", "P | \n", "['i', 'went', 'to', 'this', 'restaurant', 'in'... | \n", "63 | \n", "['I went to this ultra-luxurious restaurant in... | \n", "5 | \n", "['went', 'restaurant', 'downtown', 'new', 'yor... | \n", "35 | \n", "[('i', 4), ('this', 3), ('and', 3), ('restaura... | \n", "[('restaurant', 2), ('expensive', 2), ('went',... | \n", "... | \n", "0.223 | \n", "Counter({'i': 4, 'this': 3, 'and': 3, 'restaur... | \n", "Counter({'restaurant': 2, 'expensive': 2, 'wen... | \n", "i went to this ultra-luxurious restaurant in d... | \n", "went this ultra-luxurious restaurant downtown ... | \n", "['i', 'went', 'to', 'this', 'restaurant', 'in'... | \n", "['i', 'this', 'and', 'restaurant', 'in', 'expe... | \n", "['i_went', 'went_to', 'to_this', 'this_restaur... | \n", "['i_went', 'went_to', 'to_this', 'this_restaur... | \n", "['i', 'to', 'in', 'downtown', 'new', 'york', '... | \n", "
88 rows × 40 columns
\n", "