{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW4 [Deception] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: GET THAT DATA" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "def get_data(file, path, encoding=None):\n", "    \"\"\"Return the full text of `file` located in directory `path`.\n", "\n", "    `encoding` is passed to open(); None keeps the platform default.\n", "    \"\"\"\n", "    # os.path.join works whether or not `path` ends with a separator;\n", "    # the with-block closes the handle even if read() raises.\n", "    with open(os.path.join(path, file), encoding=encoding) as f:\n", "        return f.read()\n", "\n", "def get_data_from_files(path, encoding=None):\n", "    \"\"\"Return the text of every regular file directly inside `path`.\n", "\n", "    Names are sorted so the result order is reproducible (os.listdir\n", "    returns entries in arbitrary order). Sub-directories are skipped\n", "    because open() cannot read them.\n", "    \"\"\"\n", "    names = sorted(\n", "        name for name in os.listdir(path)\n", "        if os.path.isfile(os.path.join(path, name))\n", "    )\n", "    return [get_data(name, path, encoding) for name in names]\n", "\n", "# pos = get_data_from_files('../pos_cornell//')\n", "# neg = get_data_from_files('../neg_cornell/')\n", "pos = get_data_from_files('../hw4_lie_false/')\n", "neg = get_data_from_files('../hw4_lie_true/')" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "PoN | \n", "
---|---|---|
0 | \n", "? | \n", "N | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "
3 | \n", "? | \n", "N | \n", "
4 | \n", "I have been to a Asian restaurant in New York ... | \n", "N | \n", "
... | \n", "... | \n", "... | \n", "
87 | \n", "Mikes Pizza High Point NY Service was very slo... | \n", "P | \n", "
88 | \n", "After I went shopping with some of my friend w... | \n", "P | \n", "
89 | \n", "I entered the restaurant and a waitress came b... | \n", "P | \n", "
90 | \n", "Carlos Plate Shack was the worst dining experi... | \n", "P | \n", "
91 | \n", "Olive Oil Garden was very disappointing. I exp... | \n", "P | \n", "
92 rows × 2 columns
\n", "\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "
---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "
\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "
\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "stemmed | \n", "stemmed_no_sw | \n", "
---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "[the, worst, restaur, that, i, have, ever, eat... | \n", "[worst, restaur, ever, eaten, undoubtedli, pla... | \n", "
\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "stemmed | \n", "stemmed_no_sw | \n", "lemmed | \n", "lemmed_no_sw | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "[the, worst, restaur, that, i, have, ever, eat... | \n", "[worst, restaur, ever, eaten, undoubtedli, pla... | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "
\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "stemmed | \n", "stemmed_no_sw | \n", "lemmed | \n", "lemmed_no_sw | \n", "pos | \n", "pos_no_sw | \n", "pos_dict | \n", "pos_dict_no_sw | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "{} | \n", "{} | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "[(twin, NN), (trees, NNS), (cicero, VBP), (ny,... | \n", "[(twin, NN), (trees, NNS), (cicero, VBP), (ny,... | \n", "{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... | \n", "{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "[the, worst, restaur, that, i, have, ever, eat... | \n", "[worst, restaur, ever, eaten, undoubtedli, pla... | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "[(the, DT), (worst, JJS), (restaurant, NN), (t... | \n", "[(worst, RBS), (restaurant, NN), (ever, RB), (... | \n", "{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... | \n", "{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... | \n", "
\n", " | 0 | \n", "PoN | \n", "sentences | \n", "num_sentences | \n", "tokens | \n", "num_tokens | \n", "no_sw | \n", "num_no_sw | \n", "stemmed | \n", "stemmed_no_sw | \n", "lemmed | \n", "lemmed_no_sw | \n", "pos | \n", "pos_no_sw | \n", "pos_dict | \n", "pos_dict_no_sw | \n", "bow | \n", "bow_no_sw | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "? | \n", "N | \n", "[?] | \n", "1 | \n", "[] | \n", "0 | \n", "[] | \n", "0 | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "[] | \n", "{} | \n", "{} | \n", "{} | \n", "{} | \n", "
1 | \n", "Twin Trees Cicero NY HUGE salad bar and high q... | \n", "N | \n", "[Twin Trees Cicero NY HUGE salad bar and high ... | \n", "4 | \n", "[twin, trees, cicero, ny, huge, salad, bar, an... | \n", "53 | \n", "[twin, trees, cicero, ny, huge, salad, bar, hi... | \n", "32 | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "[twin, tree, cicero, ny, huge, salad, bar, and... | \n", "[twin, tree, cicero, ny, huge, salad, bar, hig... | \n", "[(twin, NN), (trees, NNS), (cicero, VBP), (ny,... | \n", "[(twin, NN), (trees, NNS), (cicero, VBP), (ny,... | \n", "{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... | \n", "{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... | \n", "{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... | \n", "{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... | \n", "
2 | \n", "The worst restaurant that I have ever eaten in... | \n", "N | \n", "[The worst restaurant that I have ever eaten i... | \n", "5 | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "105 | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "49 | \n", "[the, worst, restaur, that, i, have, ever, eat... | \n", "[worst, restaur, ever, eaten, undoubtedli, pla... | \n", "[the, worst, restaurant, that, i, have, ever, ... | \n", "[worst, restaurant, ever, eaten, undoubtedly, ... | \n", "[(the, DT), (worst, JJS), (restaurant, NN), (t... | \n", "[(worst, RBS), (restaurant, NN), (ever, RB), (... | \n", "{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... | \n", "{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... | \n", "{'the': 6, 'worst': 1, 'restaurant': 1, 'that'... | \n", "{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... | \n", "
\n", " | NN | \n", "NNS | \n", "VBP | \n", "JJ | \n", "CC | \n", "VBZ | \n", "DT | \n", "RB | \n", "VB | \n", "TO | \n", "... | \n", "VBG | \n", "EX | \n", "JJR | \n", "PDT | \n", "RP | \n", "WP | \n", "CD | \n", "RBR | \n", "MD | \n", "RBS | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PoN | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
N | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
N | \n", "11 | \n", "3 | \n", "3 | \n", "9 | \n", "3 | \n", "2 | \n", "4 | \n", "4 | \n", "4 | \n", "3 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
N | \n", "29 | \n", "1 | \n", "1 | \n", "7 | \n", "5 | \n", "1 | \n", "14 | \n", "8 | \n", "4 | \n", "4 | \n", "... | \n", "1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
N | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
N | \n", "13 | \n", "2 | \n", "2 | \n", "5 | \n", "1 | \n", "2 | \n", "5 | \n", "0 | \n", "0 | \n", "1 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 28 columns
\n", "\n", " | NN | \n", "NNS | \n", "VBP | \n", "JJ | \n", "CC | \n", "VBZ | \n", "DT | \n", "RB | \n", "VB | \n", "TO | \n", "... | \n", "VBG | \n", "EX | \n", "JJR | \n", "PDT | \n", "RP | \n", "WP | \n", "CD | \n", "RBR | \n", "MD | \n", "RBS | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
PoN | \n", "\n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " | \n", " |
N | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
N | \n", "0.207547 | \n", "0.056604 | \n", "0.056604 | \n", "0.169811 | \n", "0.056604 | \n", "0.037736 | \n", "0.075472 | \n", "0.075472 | \n", "0.075472 | \n", "0.056604 | \n", "... | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
N | \n", "0.276190 | \n", "0.009524 | \n", "0.009524 | \n", "0.066667 | \n", "0.047619 | \n", "0.009524 | \n", "0.133333 | \n", "0.076190 | \n", "0.038095 | \n", "0.038095 | \n", "... | \n", "0.009524 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
N | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "
N | \n", "0.288889 | \n", "0.044444 | \n", "0.044444 | \n", "0.111111 | \n", "0.022222 | \n", "0.044444 | \n", "0.111111 | \n", "0.000000 | \n", "0.000000 | \n", "0.022222 | \n", "... | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
P | \n", "0.162791 | \n", "0.046512 | \n", "0.023256 | \n", "0.139535 | \n", "0.046512 | \n", "0.000000 | \n", "0.046512 | \n", "0.069767 | \n", "0.093023 | \n", "0.046512 | \n", "... | \n", "0.023256 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.069767 | \n", "0.000000 | \n", "
P | \n", "0.208333 | \n", "0.041667 | \n", "0.000000 | \n", "0.041667 | \n", "0.000000 | \n", "0.000000 | \n", "0.083333 | \n", "0.000000 | \n", "0.041667 | \n", "0.041667 | \n", "... | \n", "0.041667 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.041667 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
P | \n", "0.191919 | \n", "0.010101 | \n", "0.000000 | \n", "0.070707 | \n", "0.070707 | \n", "0.010101 | \n", "0.141414 | \n", "0.101010 | \n", "0.070707 | \n", "0.040404 | \n", "... | \n", "0.010101 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.010101 | \n", "0.0 | \n", "0.010101 | \n", "0.020202 | \n", "
P | \n", "0.206452 | \n", "0.045161 | \n", "0.019355 | \n", "0.090323 | \n", "0.045161 | \n", "0.000000 | \n", "0.096774 | \n", "0.070968 | \n", "0.045161 | \n", "0.045161 | \n", "... | \n", "0.012903 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.025806 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
P | \n", "0.232558 | \n", "0.023256 | \n", "0.046512 | \n", "0.139535 | \n", "0.046512 | \n", "0.000000 | \n", "0.093023 | \n", "0.069767 | \n", "0.046512 | \n", "0.046512 | \n", "... | \n", "0.000000 | \n", "0.0 | \n", "0.0 | \n", "0.0 | \n", "0.023256 | \n", "0.0 | \n", "0.000000 | \n", "0.0 | \n", "0.000000 | \n", "0.000000 | \n", "
92 rows × 28 columns
\n", "