def get_data_from_files(path, encoding=None):
    """Read every regular file in a directory and return the contents as strings.

    Parameters
    ----------
    path : str
        Directory holding one document per file. A trailing path separator
        is optional; the file path is built with ``os.path.join``.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default.

    Returns
    -------
    list of str
        One entry per file, ordered by file name.

    Raises
    ------
    OSError
        If ``path`` cannot be listed or one of its files cannot be opened.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and therefore the DataFrame row order) reproducible.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # cannot be opened as text files.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path, encoding=encoding) as f:
            results.append(f.read())
    return results
import pandas as pd

# STEP 2a: one row per document; the raw text lands in the default column 0.
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

# STEP 2b: tag every row with the label of the list it came from.
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'

# STEP 2c: stack the two frames.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement. As with append, each frame keeps its own
# index labels, so labels repeat across the N and P halves of all_df.
all_df = pd.concat([neg_df, pos_df])
\n", " | 0 | \n", "PoN | \n", "
---|---|---|
0 | \n", "? | \n", "N | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "
3 | \n", "? | \n", "N | \n", "
4 | \n", "I have been to a Asian restaurant in New York ... | \n", "N | \n", "
... | \n", "... | \n", "... | \n", "
41 | \n", "Mikes Pizza High Point NY Service was very slo... | \n", "P | \n", "
42 | \n", "After I went shopping with some of my friend w... | \n", "P | \n", "
43 | \n", "I entered the restaurant and a waitress came b... | \n", "P | \n", "
44 | \n", "Carlos Plate Shack was the worst dining experi... | \n", "P | \n", "
45 | \n", "Olive Oil Garden was very disappointing. I exp... | \n", "P | \n", "
92 rows × 2 columns
\n", "\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "
---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "
3 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "
4 | \n", "I have been to a Asian restaurant in New York ... | \n", "N | \n", "[I have been to a Asian restaurant in New York... | \n", "4 | \n", "[i, have, been, to, a, asian, restaurant, in, ... | \n", "45 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41 | \n", "Mikes Pizza High Point NY Service was very slo... | \n", "P | \n", "[Mikes Pizza High Point NY Service was very sl... | \n", "4 | \n", "[mikes, pizza, high, point, ny, service, was, ... | \n", "43 | \n", "
42 | \n", "After I went shopping with some of my friend w... | \n", "P | \n", "[After I went shopping with some of my friend ... | \n", "2 | \n", "[after, i, went, shopping, with, some, of, my,... | \n", "24 | \n", "
43 | \n", "I entered the restaurant and a waitress came b... | \n", "P | \n", "[I entered the restaurant and a waitress came ... | \n", "5 | \n", "[i, entered, the, restaurant, and, a, waitress... | \n", "99 | \n", "
44 | \n", "Carlos Plate Shack was the worst dining experi... | \n", "P | \n", "[Carlos Plate Shack was the worst dining exper... | \n", "9 | \n", "[carlos, plate, shack, was, the, worst, dining... | \n", "155 | \n", "
45 | \n", "Olive Oil Garden was very disappointing. I exp... | \n", "P | \n", "[Olive Oil Garden was very disappointing., I e... | \n", "5 | \n", "[olive, oil, garden, was, very, disappointing,... | \n", "43 | \n", "
92 rows × 6 columns
\n", "\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "
3 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "
4 | \n", "I have been to a Asian restaurant in New York ... | \n", "N | \n", "[I have been to a Asian restaurant in New York... | \n", "4 | \n", "[i, have, been, to, a, asian, restaurant, in, ... | \n", "45 | \n", "[asian, restaurant, new, york, city, menu, wri... | \n", "23 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41 | \n", "Mikes Pizza High Point NY Service was very slo... | \n", "P | \n", "[Mikes Pizza High Point NY Service was very sl... | \n", "4 | \n", "[mikes, pizza, high, point, ny, service, was, ... | \n", "43 | \n", "[mikes, pizza, high, point, ny, service, slow,... | \n", "26 | \n", "
42 | \n", "After I went shopping with some of my friend w... | \n", "P | \n", "[After I went shopping with some of my friend ... | \n", "2 | \n", "[after, i, went, shopping, with, some, of, my,... | \n", "24 | \n", "[went, shopping, friend, went, dodo, restauran... | \n", "11 | \n", "
43 | \n", "I entered the restaurant and a waitress came b... | \n", "P | \n", "[I entered the restaurant and a waitress came ... | \n", "5 | \n", "[i, entered, the, restaurant, and, a, waitress... | \n", "99 | \n", "[entered, restaurant, waitress, came, blanking... | \n", "49 | \n", "
44 | \n", "Carlos Plate Shack was the worst dining experi... | \n", "P | \n", "[Carlos Plate Shack was the worst dining exper... | \n", "9 | \n", "[carlos, plate, shack, was, the, worst, dining... | \n", "155 | \n", "[carlos, plate, shack, worst, dining, experien... | \n", "88 | \n", "
45 | \n", "Olive Oil Garden was very disappointing. I exp... | \n", "P | \n", "[Olive Oil Garden was very disappointing., I e... | \n", "5 | \n", "[olive, oil, garden, was, very, disappointing,... | \n", "43 | \n", "[olive, oil, garden, disappointing, expect, go... | \n", "23 | \n", "
92 rows × 8 columns
\n", "\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "topwords_unfil | \n", "topwords_fil | \n", "freq_dist | \n", "freq_dist_unfil | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "{} | \n", "{} | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "[(and, 3), (to, 3), (are, 2), (the, 2), (twin,... | \n", "[(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ... | \n", "{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... | \n", "{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "[(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),... | \n", "[(pepper, 3), (veggie, 2), (sandwich, 2), (red... | \n", "{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... | \n", "{'the': 6, 'worst': 1, 'restaurant': 1, 'that'... | \n", "
3 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "{} | \n", "{} | \n", "
4 | \n", "I have been to a Asian restaurant in New York ... | \n", "N | \n", "[I have been to a Asian restaurant in New York... | \n", "4 | \n", "[i, have, been, to, a, asian, restaurant, in, ... | \n", "45 | \n", "[asian, restaurant, new, york, city, menu, wri... | \n", "23 | \n", "[(i, 3), (a, 3), (the, 2), (is, 2), (by, 2), (... | \n", "[(asian, 1), (restaurant, 1), (new, 1), (york,... | \n", "{'asian': 1, 'restaurant': 1, 'new': 1, 'york'... | \n", "{'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
41 | \n", "Mikes Pizza High Point NY Service was very slo... | \n", "P | \n", "[Mikes Pizza High Point NY Service was very sl... | \n", "4 | \n", "[mikes, pizza, high, point, ny, service, was, ... | \n", "43 | \n", "[mikes, pizza, high, point, ny, service, slow,... | \n", "26 | \n", "[(pizza, 2), (was, 2), (you, 2), (would, 2), (... | \n", "[(pizza, 2), (would, 2), (mikes, 1), (high, 1)... | \n", "{'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... | \n", "{'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... | \n", "
42 | \n", "After I went shopping with some of my friend w... | \n", "P | \n", "[After I went shopping with some of my friend ... | \n", "2 | \n", "[after, i, went, shopping, with, some, of, my,... | \n", "24 | \n", "[went, shopping, friend, went, dodo, restauran... | \n", "11 | \n", "[(i, 2), (went, 2), (of, 2), (after, 1), (shop... | \n", "[(went, 2), (shopping, 1), (friend, 1), (dodo,... | \n", "{'went': 2, 'shopping': 1, 'friend': 1, 'dodo'... | \n", "{'after': 1, 'i': 2, 'went': 2, 'shopping': 1,... | \n", "
43 | \n", "I entered the restaurant and a waitress came b... | \n", "P | \n", "[I entered the restaurant and a waitress came ... | \n", "5 | \n", "[i, entered, the, restaurant, and, a, waitress... | \n", "99 | \n", "[entered, restaurant, waitress, came, blanking... | \n", "49 | \n", "[(the, 9), (i, 6), (and, 6), (to, 4), (a, 2), ... | \n", "[(waitress, 2), (waited, 2), (even, 2), (food,... | \n", "{'entered': 1, 'restaurant': 1, 'waitress': 2,... | \n", "{'i': 6, 'entered': 1, 'the': 9, 'restaurant':... | \n", "
44 | \n", "Carlos Plate Shack was the worst dining experi... | \n", "P | \n", "[Carlos Plate Shack was the worst dining exper... | \n", "9 | \n", "[carlos, plate, shack, was, the, worst, dining... | \n", "155 | \n", "[carlos, plate, shack, worst, dining, experien... | \n", "88 | \n", "[(the, 9), (to, 7), (plate, 6), (and, 5), (my,... | \n", "[(plate, 6), (southern, 3), (comfort, 3), (ext... | \n", "{'carlos': 1, 'plate': 6, 'shack': 1, 'worst':... | \n", "{'carlos': 1, 'plate': 6, 'shack': 1, 'was': 3... | \n", "
45 | \n", "Olive Oil Garden was very disappointing. I exp... | \n", "P | \n", "[Olive Oil Garden was very disappointing., I e... | \n", "5 | \n", "[olive, oil, garden, was, very, disappointing,... | \n", "43 | \n", "[olive, oil, garden, disappointing, expect, go... | \n", "23 | \n", "[(the, 3), (olive, 2), (oil, 2), (garden, 2), ... | \n", "[(olive, 2), (oil, 2), (garden, 2), (good, 2),... | \n", "{'olive': 2, 'oil': 2, 'garden': 2, 'disappoin... | \n", "{'olive': 2, 'oil': 2, 'garden': 2, 'was': 2, ... | \n", "
92 rows × 12 columns
\n", "