{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['This', 'is', 'any', 'sentence', 'of', 'text', '.', 'It', 'can', 'have', 'punctuation', ',', 'CAPS', '!', ',', 'etc', '.']\n",
      "dict_items([('This', 1), ('is', 1), ('any', 1), ('sentence', 1), ('of', 1), ('text', 1), ('.', 2), ('It', 1), ('can', 1), ('have', 1), ('punctuation', 1), (',', 2), ('CAPS', 1), ('!', 1), ('etc', 1)])\n",
      "[('.', 2)]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import nltk\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.probability import FreqDist\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.corpus import stopwords\n",
    "## For Stemming\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "import os\n",
    "#--------------------------------------\n",
    "## Tokenization\n",
    "## Breaking text into tokens -  in this case - words\n",
    "#-----------------------------------------------------\n",
    "text=\"This is any sentence of text. It can have punctuation, CAPS!, etc.\"\n",
    "tokenized_word=word_tokenize(text)\n",
    "print(tokenized_word)\n",
    "## Looking at word frequency\n",
    "fdist = FreqDist(tokenized_word)\n",
    "print(fdist.items())\n",
    "print(fdist.most_common(1))  ## most common lleft to right\n",
    "#print(fdist.most_common(2))  ## two most common\n",
    "#print(fdist.most_common(3))  ## three most common\n",
    "#print(fdist.freq(\"is\"))  ## how frequent \n",
    "## \"is\" occurs once in 17 words. 1/17 = .058\n",
    "fdist.N()  # freq of each\n",
    "# Visualize word frequency\n",
    "fdist.plot(30,cumulative=False)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--------------------------------\n"
     ]
    }
   ],
   "source": [
    "#---------------------------\n",
    "## Stopwords - English\n",
    "#------------------------------\n",
    "stop_words=set(stopwords.words(\"english\"))\n",
    "#print(stop_words)\n",
    "#---------------------------\n",
    "## Removing Stopwords\n",
    "#------------------------------\n",
    "filtered_text=[]   ## Create a new empty list\n",
    "for w in tokenized_word:\n",
    " #   print(w)\n",
    "    if w not in stop_words:\n",
    "        filtered_text.append(w)\n",
    "#print(\"Tokenized text:\",tokenized_word)\n",
    "#print(\"Filterd text:\",filtered_text)\n",
    "\n",
    "#------------------------\n",
    "# Stemming\n",
    "#---------------------\n",
    "ps = PorterStemmer()   ## method from nltk\n",
    "\n",
    "stemmed_words=[]  ## make new empty list\n",
    "for w in filtered_text:\n",
    "    stemmed_words.append(ps.stem(w))\n",
    "\n",
    "#print(\"Filtered:\",filtered_text)\n",
    "#print(\"Stemmed:\",stemmed_words)\n",
    "\n",
    "##-------------------------------\n",
    "#Lemmatization reduces words to their base word\n",
    "#--------------------------------------------------\n",
    "# Lemmatization is usually more sophisticated than \n",
    "#stemming. Stemmer works on an individual word without\n",
    "# knowledge of the context. For example, The word \"better\"\n",
    "# has \"good\" as its lemma. This thing will miss by\n",
    "# stemming because it requires a dictionary look-up.\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "lem = WordNetLemmatizer()  ## method we are using\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "stem = PorterStemmer()\n",
    "word = \"flying\"\n",
    "# print(\"Lemmatized Word:\",lem.lemmatize(word,\"v\"))\n",
    "# print(\"Stemmed Word:\",stem.stem(word))\n",
    "\n",
    "## ------------------------\n",
    "## Part-of-Speech(POS) tagging\n",
    "## ------------------------------------\n",
    "# identify the grammatical group\n",
    "# Whether it is a NOUN, PRONOUN, \n",
    "# ADJECTIVE, VERB, ADVERBS\n",
    "#------------------------------------------\n",
    "sent = \"Three wonderful things in life are hiking, moutains, and COFFEE!\"\n",
    "Mytokens=nltk.word_tokenize(sent)\n",
    "#print(Mytokens)\n",
    "MyTAGS = nltk.pos_tag(Mytokens)\n",
    "#print(MyTAGS)\n",
    "\n",
    "#----------------------------------------\n",
    "# Importing a Corpus\n",
    "#----------------------------------------\n",
    "## On my computer, there is a relative path to data\n",
    "## DATA/Movie_test/neg\n",
    "## and \n",
    "## DATA/Movie_test/pos\n",
    "## neg and pos are folders that contain .txt\n",
    "## files. neg are the negative sentiment and pos\n",
    "## are positive\n",
    "## We need to bring in this data into ONE\n",
    "## large matrix (or dataframe) with the words\n",
    "## as features (variables or columns)\n",
    "## and an extra column for the label (P or N)\n",
    "## This assumes import nltk\n",
    "print(\"--------------------------------\")\n",
    "#print(\"CORPUS EXAMPLES\")\n",
    "#from nltk.corpus import PlaintextCorpusReader\n",
    "\n",
    "#XXXXXXXXXXXXXXXXXXXXX\n",
    "## https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html\n",
    "#XXXXXXXXXXXXXXXXXXXXX\n",
    "#---------------\n",
    "#\n",
    "import sklearn\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "\n",
    "#######################\n",
    "## IMPORTANT - the following is an example\n",
    "## of many of the \"attributes\" that CountVectorizer\n",
    "## can use. HOWEVER !! if you do not understand them\n",
    "## they make create a HUGE problem for you. \n",
    "## DO not use them blindly. \n",
    "## You can use either:\n",
    "##  MyVect2=CountVectorizer(input='content')  OR\n",
    "##  MyVect3=CountVectorizer(input='filename')\n",
    "## with on one attribute - namely the input type. \n",
    "## \n",
    "## Below, while I show all the options, I actually \n",
    "## only use a few. I have created 4 examples of creating\n",
    "## a CountVectorizer. They all follow....\n",
    "#########################################################\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "## Example 1\n",
    "MyVectorizer1=CountVectorizer(\n",
    "        input='content', ## can be set as 'content', 'file', or 'filename'\n",
    "        #If set as ‘filename’, the **sequence passed as an argument to fit**\n",
    "        #is expected to be a list of filenames \n",
    "        #https://scikit-learn.org/stable/modules/generated/\n",
    "        ##sklearn.feature_extraction.text.CountVectorizer.html#\n",
    "        ##examples-using-sklearn-feature-extraction-text-countvectorizer\n",
    "        encoding='latin-1',\n",
    "        decode_error='ignore', #{‘strict’, ‘ignore’, ‘replace’}\n",
    "        strip_accents=None, # {‘ascii’, ‘unicode’, None}\n",
    "        lowercase=True, \n",
    "        preprocessor=None, \n",
    "        tokenizer=None, \n",
    "        #stop_words='english', #string {‘english’}, list, or None (default)\n",
    "        stop_words=None,\n",
    "        token_pattern='(?u)\\b\\w\\w+\\b', #Regular expression denoting what constitutes a “token”\n",
    "        ngram_range=(1, 1), \n",
    "        analyzer='word', \n",
    "        max_df=1.0, # ignore terms w document freq strictly > threshold \n",
    "        min_df=1, \n",
    "        max_features=None, \n",
    "        vocabulary=None, \n",
    "        binary=False, #If True, all non zero counts are set to 1\n",
    "        #dtype=<class 'numpy.int64'> \n",
    "        )\n",
    "#print(MyVectorizer1)\n",
    "## Examples 2 and 3 for creating CountVectorizers\n",
    "MyVect2=CountVectorizer(input='content')\n",
    "MyVect3=CountVectorizer(input='filename')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n",
       "                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',\n",
       "                lowercase=True, max_df=1.0, max_features=None, min_df=1,\n",
       "                ngram_range=(1, 1), preprocessor=None, stop_words=None,\n",
       "                strip_accents=None, token_pattern='(?u)\\\\b\\\\w\\\\w+\\\\b',\n",
       "                tokenizer=None, vocabulary=None)"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "MyVect2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Dog.txt\n",
      "/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Hike.txt\n",
      "full list...\n",
      "['/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Dog.txt', '/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Hike.txt']\n",
      "Building Vecotrizer....\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/pandas/core/frame.py:7123: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=False'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass 'sort=True'.\n",
      "\n",
      "  sort=sort,\n"
     ]
    }
   ],
   "source": [
    "\n",
    "## Use glob to create a LIST of files in your folder\n",
    "#path_file=\"C:\\\\Users\\\\profa\\\\Documents\\\\Python Scripts\\\\TextMining\\\\DATA\\\\SmallTextDocs\\\\Hike\"\n",
    "#import glob\n",
    "#import os\n",
    "all_file_names = []\n",
    "## Update this to YOUR PATH - the location where you place SmallTextDocs\n",
    "## SmallTextDocs is a folder that contains two text (.txt) files\n",
    "## called Dog.txt and Hike.txt\n",
    "## If you have MAC, you may or may not need both slashes \\\\ and/or the direction\n",
    "## may be different - experiment....\n",
    "path=\"/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs\"\n",
    "# path=\"C:\\\\Users\\\\profa\\\\Documents\\\\Python Scripts\\\\TextMining\\\\DATA\\\\SmallTextDocs\"\n",
    "#print(\"calling os...\")\n",
    "#print(os.listdir(path))\n",
    "FileNameList=os.listdir(path)\n",
    "#print(FileNameList)\n",
    "ListOfCompleteFiles=[]\n",
    "for name in os.listdir(path):\n",
    "    print(path+ \"/\" + name)\n",
    "    next=path+ \"/\" + name\n",
    "    ListOfCompleteFiles.append(next)\n",
    "#print(\"DONE...\")\n",
    "print(\"full list...\")\n",
    "print(ListOfCompleteFiles)\n",
    "\n",
    "AllText_AllFiles=[]\n",
    "for file in ListOfCompleteFiles:\n",
    "    FILE=open(file)\n",
    "   # print(FILE.read())\n",
    "    content=FILE.read()\n",
    "    AllText_AllFiles.append(content)\n",
    "    FILE.close()\n",
    "#print(\"AllText_AllFiles is....\\n\")\n",
    "#print(AllText_AllFiles)  \n",
    "#print(\"FIT TRANSFORM-----------------\")\n",
    "## FIT TRANSFORM USING a CountVect  \n",
    "#print(\"The ListOfCompleteFiles is ...\")\n",
    "#print(ListOfCompleteFiles)\n",
    "X3=MyVect3.fit_transform(ListOfCompleteFiles)\n",
    "#print(\"The AllText_AllFiles is...\")\n",
    "#print(AllText_AllFiles)\n",
    "X2=MyVect2.fit_transform(AllText_AllFiles)\n",
    "#print(type(X2))\n",
    "#print(type(X3))\n",
    "#print(X2.get_shape())\n",
    "ColumnNames2=MyVect2.get_feature_names()\n",
    "ColumnNames3=MyVect3.get_feature_names()\n",
    "#print(\"The col name for 3 \", ColumnNames3)\n",
    "\n",
    "#import pandas as pd\n",
    "#https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html\n",
    "## !!!!!!!!!! The final DTM in Python!! (this took 20 hours :)\n",
    "CorpusDF_A2=pd.DataFrame(X2.toarray(),columns=ColumnNames2)\n",
    "#print(\"The DF for 2 is...\", CorpusDF_A2)\n",
    "CorpusDF_A=pd.DataFrame(X3.toarray(),columns=ColumnNames3)\n",
    "#print(\"The DF for 3 is...\", CorpusDF_A)\n",
    "#print(\"COMPLETE -------------------------------\")\n",
    "\n",
    "###-------------------------------------------------\n",
    "### Creating Testing and Training Data\n",
    "### from folders of txt files\n",
    "### and from a single csv\n",
    "###------------------------------------------------\n",
    "\n",
    "### Here again, we will create tiny datasets\n",
    "### to try and test our methods\n",
    "### (1) I will create a new folder called NEG\n",
    "### and another new folder called POS\n",
    "### Inside of each, I will add 5 text documents\n",
    "### The text docs can be very small - couple of sentences\n",
    "### in the NEG will be negative text docs\n",
    "### in the POS will be positive\n",
    "### (2) Next, I will create a .csv with\n",
    "### each row have features: sent, text, date\n",
    "### We will use these two formats to create two\n",
    "### seperate text/train sets. \n",
    "### !!!!!! These are two DIFFERENT examples\n",
    "###-----------------------------------------------\n",
    "###\n",
    "### POS and NEG\n",
    "###\n",
    "###\n",
    "### This is tricky because right now we have sep pos and neg\n",
    "### but we want the train and test sets to have a balance\n",
    "### of pos and neg AND we need to label them!\n",
    "### One option is to read in all POS into DF1\n",
    "### and add a column with P for pos\n",
    "### then, read all NEG into DF2\n",
    "### and add a column with N for neg\n",
    "### Finally, join and shuffle DF1 and DF2\n",
    "### From there, pull the test and train datasets.\n",
    "\n",
    "### Step 1: Read in the POS files corpus into a DF1\n",
    "## Example 4 for creating a CountVectorizer\n",
    "print(\"Building Vecotrizer....\")\n",
    "MyVect4=CountVectorizer(input='filename',\n",
    "                        analyzer = 'word',\n",
    "                        stop_words='english',\n",
    "                        token_pattern='(?u)[a-zA-Z]+'\n",
    "                        )\n",
    "path=\"/Users/danielcaraway/Documents/IST_736_TextMining/POS\"\n",
    "# path=\"C:\\\\Users\\\\profa\\\\Documents\\\\Python Scripts\\\\TextMining\\\\DATA\\\\POS\"\n",
    "\n",
    "##Create empty list\n",
    "POSListOfCompleteFiles=[]\n",
    "for name in os.listdir(path):\n",
    "#   print(path+ \"\\\\\" + name)\n",
    "    next=path+ \"/\" + name\n",
    "    POSListOfCompleteFiles.append(next)\n",
    "\n",
    "#print(\"POS full list...\")\n",
    "#print(POSListOfCompleteFiles)\n",
    "\n",
    "#print(\"FIT with Vectorizer...\")\n",
    "X4=MyVect4.fit_transform(POSListOfCompleteFiles)\n",
    "#print(type(X4))\n",
    "#print(X4.get_shape())\n",
    "POSColumnNames=MyVect4.get_feature_names()\n",
    "#print(\"Column names: \", POSColumnNames[0:10])\n",
    "#print(\"Building DF....\")\n",
    "#import pandas as pd\n",
    "POS_CorpusDF_A=pd.DataFrame(X4.toarray(),columns=POSColumnNames)\n",
    "#print(POS_CorpusDF_A)\n",
    "\n",
    "## This looks good. Notice that when I built the Vectorizer\n",
    "## above, that I used [a-zA-Z] which means \n",
    "## letter ONLY - no numbers.\n",
    "\n",
    "## Now, we need to add a column for P or N. \n",
    "## I will call it PosORNeg and because all of these\n",
    "## are positive, I will fill it with P\n",
    "## DataFrame.insert(loc, column, value, allow_duplicates=False)\n",
    "#Length=POS_CorpusDF_A.shape\n",
    "#print(Length[0])  ## num of rows\n",
    "#print(Length[1])  ## num of columns\n",
    "\n",
    "## Add column\n",
    "#print(\"Adding new column....\")\n",
    "POS_CorpusDF_A[\"PosORNeg\"]=\"P\"\n",
    "#print(POS_CorpusDF_A)\n",
    "\n",
    "## OK - now we have a labeled DF\n",
    "## !!!!!!!!!!!!!!!\n",
    "## To use this as a label later\n",
    "## it must be type categorical\n",
    "## we will get to that...\n",
    "## !!!!!!!!!!!!!!!!!!\n",
    "\n",
    "### Now - we will do tha above for\n",
    "## the negative docs....\n",
    "pathN=\"/Users/danielcaraway/Documents/IST_736_TextMining/NEG\"\n",
    "# pathN=\"C:\\\\Users\\\\profa\\\\Documents\\\\Python Scripts\\\\TextMining\\\\DATA\\\\NEG\"\n",
    "##Create empty list\n",
    "NEGListOfCompleteFiles=[]\n",
    "for name in os.listdir(pathN):\n",
    "#   print(pathN+ \"\\\\\" + name)\n",
    "    next=pathN+ \"/\" + name\n",
    "    NEGListOfCompleteFiles.append(next)\n",
    "\n",
    "#print(\"full list...\")\n",
    "#print(NEGListOfCompleteFiles)\n",
    "X5=MyVect4.fit_transform(NEGListOfCompleteFiles)\n",
    "#print(type(X5))\n",
    "#print(X5.get_shape())\n",
    "NEGColumnNames=MyVect4.get_feature_names()\n",
    "#print(NEGColumnNames)\n",
    "\n",
    "#import pandas as pd\n",
    "NEG_CorpusDF_A=pd.DataFrame(X5.toarray(),columns=NEGColumnNames)\n",
    "#print(NEG_CorpusDF_A)\n",
    "NEG_CorpusDF_A[\"PosORNeg\"]=\"N\"\n",
    "#print(NEG_CorpusDF_A)\n",
    "\n",
    "##### GOOD!\n",
    "## Now its time to join the two dataframes together\n",
    "## From there, we will sample from it to get the\n",
    "## test and train sets...\n",
    "#######################################\n",
    "\n",
    "## Create a new large Pos and Neg DF\n",
    "## https://pandas.pydata.org/pandas-docs/stable/merging.html\n",
    "result = NEG_CorpusDF_A.append(POS_CorpusDF_A)\n",
    "#print(result)\n",
    "## Replace the NaN with 0 because it actually \n",
    "## means none in this case\n",
    "result=result.fillna(0)\n",
    "#print(result)\n",
    "\n",
    "## CREATE TEXT AND TRAIN\n",
    "##\n",
    "## Now that we have a complete dataframe\n",
    "## with a label (in this case P or N)\n",
    "## we can create a testing and training set.\n",
    "## Recall that to train any model and then test\n",
    "## that model (such as NB, DT, RF, SVM, etc)\n",
    "## we must have DISJOINT and balanced training\n",
    "## and testing data\n",
    "## EACH CASE CAN BE DIFFERENT\n",
    "## In this case, our \"result\" dataframe has all the N\n",
    "## labels first and the P labels after.\n",
    "## So we CANNOT grab the first X rows as the test set or \n",
    "## they will all be N (not balanced!)\n",
    "##\n",
    "## SHUFFLE the DATAFRAME\n",
    "## df = df.sample(frac=1).reset_index(drop=True)\n",
    "## Here, specifying drop=True prevents .reset_index \n",
    "## from creating a column containing the old index entries.\n",
    "## the frac=1 means \"resample\" (shuffle) 100% of the data\n",
    "result=result.sample(frac=1).reset_index(drop=True)\n",
    "#print(result)\n",
    "## This worked! You can see that the shape is the same\n",
    "## and that the label is no longer all N and then all P\n",
    "\n",
    "## From here, we can create (randomly) the test and train sets\n",
    "# make results reproducible\n",
    "import numpy as np\n",
    "np.random.seed(140) ## or any number - does not matter\n",
    "# sample without replacement\n",
    "## I am choosing \"6\" here to make the training set\n",
    "## of size 6. This will make the test set of size 4\n",
    "## This can be any choice depending on YOU and your data\n",
    "train_ix = np.random.choice(result.index, 6, replace=False)\n",
    "df_training = result.iloc[train_ix]\n",
    "df_test = result.drop(train_ix)\n",
    "#print(\"Training set...\")\n",
    "#print(df_training)\n",
    "#print(\"Testing set...\")\n",
    "#print(df_test)\n",
    "\n",
    "\n",
    "########################################\n",
    "###\n",
    "###  READING IN ONE .csv \n",
    "###\n",
    "#########################################\n",
    "## In the above, we had two folders\n",
    "##  - one with Pos text files and\n",
    "## one with neg text files.\n",
    "## We read these in as a corpus\n",
    "## Created dataframes and merged\n",
    "## the dataframes.\n",
    "## However, this is not the only option\n",
    "## for reading in text or data - there are MANY!\n",
    "##\n",
    "## Another common option is to have one .csv\n",
    "## file that contains labels, text, and other \n",
    "## attributes.\n",
    "## You may want to extract the text and labels\n",
    "## and again build a labeled dataframe as well\n",
    "## as test and train sets.\n",
    "##\n",
    "##------------------\n",
    "## The following code will explore this:\n",
    "##################################################\n",
    "#import pandas as pd\n",
    "################"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PosORNeg</th>\n",
       "      <th>abberline</th>\n",
       "      <th>able</th>\n",
       "      <th>ably</th>\n",
       "      <th>abo</th>\n",
       "      <th>aboard</th>\n",
       "      <th>absinthe</th>\n",
       "      <th>absolutely</th>\n",
       "      <th>academy</th>\n",
       "      <th>accent</th>\n",
       "      <th>...</th>\n",
       "      <th>writer</th>\n",
       "      <th>writihing</th>\n",
       "      <th>yakov</th>\n",
       "      <th>year</th>\n",
       "      <th>years</th>\n",
       "      <th>yellow</th>\n",
       "      <th>yes</th>\n",
       "      <th>yglesias</th>\n",
       "      <th>york</th>\n",
       "      <th>young</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>P</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>4 rows × 1893 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  PosORNeg  abberline  able  ably  abo  aboard  absinthe  absolutely  academy  \\\n",
       "6        N        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "7        N        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "8        N        0.0   0.0   0.0  2.0     0.0       0.0         0.0      0.0   \n",
       "9        P        0.0   0.0   0.0  0.0     0.0       0.0         1.0      0.0   \n",
       "\n",
       "   accent  ...  writer  writihing  yakov  year  years  yellow  yes  yglesias  \\\n",
       "6       1  ...     0.0        0.0    1.0     2      0     0.0  0.0       0.0   \n",
       "7       0  ...     1.0        0.0    0.0     1      1     1.0  1.0       0.0   \n",
       "8       0  ...     0.0        0.0    0.0     0      0     0.0  0.0       0.0   \n",
       "9       0  ...     0.0        0.0    0.0     2      0     0.0  0.0       0.0   \n",
       "\n",
       "   york  young  \n",
       "6   0.0      0  \n",
       "7   0.0      0  \n",
       "8   0.0      0  \n",
       "9   0.0      0  \n",
       "\n",
       "[4 rows x 1893 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PosORNeg</th>\n",
       "      <th>abberline</th>\n",
       "      <th>able</th>\n",
       "      <th>ably</th>\n",
       "      <th>abo</th>\n",
       "      <th>aboard</th>\n",
       "      <th>absinthe</th>\n",
       "      <th>absolutely</th>\n",
       "      <th>academy</th>\n",
       "      <th>accent</th>\n",
       "      <th>...</th>\n",
       "      <th>writer</th>\n",
       "      <th>writihing</th>\n",
       "      <th>yakov</th>\n",
       "      <th>year</th>\n",
       "      <th>years</th>\n",
       "      <th>yellow</th>\n",
       "      <th>yes</th>\n",
       "      <th>yglesias</th>\n",
       "      <th>york</th>\n",
       "      <th>young</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>P</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>P</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>P</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>P</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>N</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>P</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10 rows × 1893 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  PosORNeg  abberline  able  ably  abo  aboard  absinthe  absolutely  academy  \\\n",
       "0        P        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "1        N        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "2        N        0.0   1.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "3        P        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "4        P        2.0   0.0   1.0  0.0     0.0       1.0         0.0      0.0   \n",
       "5        P        0.0   0.0   0.0  0.0     1.0       0.0         0.0      1.0   \n",
       "6        N        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "7        N        0.0   0.0   0.0  0.0     0.0       0.0         0.0      0.0   \n",
       "8        N        0.0   0.0   0.0  2.0     0.0       0.0         0.0      0.0   \n",
       "9        P        0.0   0.0   0.0  0.0     0.0       0.0         1.0      0.0   \n",
       "\n",
       "   accent  ...  writer  writihing  yakov  year  years  yellow  yes  yglesias  \\\n",
       "0       0  ...     0.0        0.0    0.0     1      0     0.0  0.0       0.0   \n",
       "1       0  ...     0.0        0.0    0.0     0      0     0.0  0.0       0.0   \n",
       "2       0  ...     0.0        0.0    0.0     1      0     0.0  0.0       0.0   \n",
       "3       0  ...     0.0        0.0    0.0     0      0     0.0  0.0       0.0   \n",
       "4       2  ...     0.0        0.0    0.0     0      0     0.0  0.0       1.0   \n",
       "5       0  ...     0.0        1.0    0.0     0      1     0.0  0.0       0.0   \n",
       "6       1  ...     0.0        0.0    1.0     2      0     0.0  0.0       0.0   \n",
       "7       0  ...     1.0        0.0    0.0     1      1     1.0  1.0       0.0   \n",
       "8       0  ...     0.0        0.0    0.0     0      0     0.0  0.0       0.0   \n",
       "9       0  ...     0.0        0.0    0.0     2      0     0.0  0.0       0.0   \n",
       "\n",
       "   york  young  \n",
       "0   0.0      0  \n",
       "1   0.0      1  \n",
       "2   0.0      0  \n",
       "3   1.0      0  \n",
       "4   0.0      0  \n",
       "5   1.0      1  \n",
       "6   0.0      0  \n",
       "7   0.0      0  \n",
       "8   0.0      0  \n",
       "9   0.0      0  \n",
       "\n",
       "[10 rows x 1893 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
