{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTING\n",
    "## =======================================================\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "## =======================================================\n",
    "## MACHINE LEARNING\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "from nltk.stem import PorterStemmer\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "\n",
    "# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')\n",
    "# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')\n",
    "# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')\n",
    "# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')\n",
    "# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')\n",
    "\n",
    "# Feature extractors that do_the_thing iterates over, in reporting order.\n",
    "vectorizers = [\n",
    "    # 0: unigrams, binary term indicators\n",
    "    CountVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        binary=True,\n",
    "    ),\n",
    "    # 1: unigrams, raw term counts\n",
    "    CountVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        binary=False,\n",
    "    ),\n",
    "    # 2: unigrams + bigrams, raw term counts\n",
    "    CountVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        ngram_range=(1,2),\n",
    "    ),\n",
    "    # 3: unigrams, tf-idf weights\n",
    "    TfidfVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        use_idf=True,\n",
    "    ),\n",
    "    # 4: unigrams, tf-idf weights, with an additional max_df=0.50 cap\n",
    "    TfidfVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        max_df=0.50,\n",
    "        use_idf=True,\n",
    "    ),\n",
    "    # 5: unigrams + bigrams, tf-idf weights\n",
    "    TfidfVectorizer(\n",
    "        encoding='latin-1',\n",
    "        stop_words='english',\n",
    "        min_df=5,\n",
    "        ngram_range=(1,2),\n",
    "        use_idf=True,\n",
    "    ),\n",
    "]\n",
    "\n",
    "def get_test_train_vec(X, y, vectorizer, test_size=0.4, random_state=0):\n",
    "    \"\"\"Split the data, then vectorize both parts with `vectorizer`.\n",
    "\n",
    "    The vectorizer is fitted on the training split only and then applied\n",
    "    unchanged to the test split.\n",
    "\n",
    "    Args:\n",
    "        X: Raw documents.\n",
    "        y: Labels aligned with X.\n",
    "        vectorizer: Object exposing fit_transform() and transform(); it is\n",
    "            (re)fitted in place by this call.\n",
    "        test_size: Forwarded to train_test_split. Defaults to 0.4, the\n",
    "            value that used to be hard-coded here.\n",
    "        random_state: Forwarded to train_test_split. Defaults to 0, the\n",
    "            value that used to be hard-coded here.\n",
    "\n",
    "    Returns:\n",
    "        Tuple (X_train_vec, X_test_vec, y_train, y_test).\n",
    "    \"\"\"\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=test_size, random_state=random_state)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "    X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, X_test_vec, y_train, y_test\n",
    "\n",
    "def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):\n",
    "    \"\"\"Fit a MultinomialNB model and print its evaluation on the test split.\n",
    "\n",
    "    Prints, in order: the confusion matrix, the classification report and\n",
    "    the value of the classifier's score() on the test split. Returns None.\n",
    "\n",
    "    Args:\n",
    "        X_train_vec: Training feature matrix.\n",
    "        X_test_vec: Test feature matrix.\n",
    "        y_train: Training labels.\n",
    "        y_test: Test labels.\n",
    "        labels: Class labels, in the order used for the confusion-matrix\n",
    "            rows/columns and the report rows.\n",
    "        target_names: Display names for the report, aligned with `labels`.\n",
    "    \"\"\"\n",
    "    mnb_clf = MultinomialNB()\n",
    "    mnb_clf.fit(X_train_vec, y_train)\n",
    "    print('*****MNB*****')\n",
    "    y_pred = mnb_clf.predict(X_test_vec)\n",
    "    cm = confusion_matrix(y_test, y_pred, labels=labels)\n",
    "    print('*****CONFUSION MATRIX*****')\n",
    "    print(cm)\n",
    "    print('*****CLASSIFICATION REPORT*****')\n",
    "    # Pass `labels` here too, so every entry of target_names is tied to an\n",
    "    # explicit class (as in the confusion matrix) rather than to whichever\n",
    "    # classes happen to occur in y_test / y_pred.\n",
    "    print(classification_report(y_test, y_pred, labels=labels,\n",
    "                                target_names=target_names))\n",
    "    print('*****SCORES*****')\n",
    "    print(mnb_clf.score(X_test_vec, y_test))\n",
    "    \n",
    "def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names,\n",
    "            C=1, max_iter=1000):\n",
    "    \"\"\"Fit a LinearSVC model and print its evaluation on the test split.\n",
    "\n",
    "    Prints, in order: the confusion matrix, the classification report, the\n",
    "    decision_function values of the first test sample and the value of the\n",
    "    classifier's score() on the test split. Returns None.\n",
    "\n",
    "    Args:\n",
    "        X_train_vec: Training feature matrix.\n",
    "        X_test_vec: Test feature matrix.\n",
    "        y_train: Training labels.\n",
    "        y_test: Test labels.\n",
    "        labels: Class labels, in the order used for the confusion-matrix\n",
    "            rows/columns and the report rows.\n",
    "        target_names: Display names for the report, aligned with `labels`.\n",
    "        C: Forwarded to LinearSVC. Defaults to 1, the value that used to be\n",
    "            hard-coded here.\n",
    "        max_iter: Forwarded to LinearSVC. Raise it when liblinear emits a\n",
    "            ConvergenceWarning. NOTE(review): 1000 is assumed to equal the\n",
    "            library default, so default behaviour is unchanged - confirm\n",
    "            for the installed scikit-learn version.\n",
    "    \"\"\"\n",
    "    svm_clf = LinearSVC(C=C, max_iter=max_iter)\n",
    "    svm_clf.fit(X_train_vec, y_train)\n",
    "    print('=====SVM=====')\n",
    "    y_pred = svm_clf.predict(X_test_vec)\n",
    "    cm = confusion_matrix(y_test, y_pred, labels=labels)\n",
    "    print('=====CONFUSION MATRIX=====')\n",
    "    print(cm)\n",
    "\n",
    "    print('=====CLASSIFICATION REPORT=====')\n",
    "    # Pass `labels` here too, so every entry of target_names is tied to an\n",
    "    # explicit class (as in the confusion matrix) rather than to whichever\n",
    "    # classes happen to occur in y_test / y_pred.\n",
    "    print(classification_report(y_test, y_pred, labels=labels,\n",
    "                                target_names=target_names))\n",
    "\n",
    "    svm_confidence_scores = svm_clf.decision_function(X_test_vec)\n",
    "    print('=====CONFIDENCE SCORES=====')\n",
    "    # Only the first test sample's scores are shown, as a spot check.\n",
    "    print(svm_confidence_scores[0])\n",
    "    print('=====SCORES=====')\n",
    "    print(svm_clf.score(X_test_vec, y_test))\n",
    "    \n",
    "def do_the_thing(X, y, labels, target_names):\n",
    "    \"\"\"Evaluate every entry of `vectorizers` with both classifiers.\n",
    "\n",
    "    For each vectorizer this prints a banner naming it, splits and\n",
    "    vectorizes the data through get_test_train_vec, and then prints the\n",
    "    run_mnb and run_svm reports for that split.\n",
    "\n",
    "    Args:\n",
    "        X: Raw documents.\n",
    "        y: Labels aligned with X.\n",
    "        labels: Forwarded to run_mnb / run_svm.\n",
    "        target_names: Forwarded to run_mnb / run_svm.\n",
    "    \"\"\"\n",
    "    banner = '++' * 20\n",
    "    for idx, vectorizer in enumerate(vectorizers):\n",
    "        # Name shown in the banner: the text before the first '(' of the\n",
    "        # vectorizer's string form.\n",
    "        kind = str(vectorizer).partition('(')[0]\n",
    "        print(banner)\n",
    "        print('Vectorizer Scores for ' + str(idx) + '_' + kind)\n",
    "        print(banner)\n",
    "        # (X_train_vec, X_test_vec, y_train, y_test), shared by both models.\n",
    "        splits = get_test_train_vec(X, y, vectorizer)\n",
    "        run_mnb(*splits, labels, target_names)\n",
    "        run_svm(*splits, labels, target_names)\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 0_CountVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  733  1264   817   106    11]\n",
      " [  602  4132  5411   649    30]\n",
      " [  246  2397 25756  3226   239]\n",
      " [   19   454  5580  6248   767]\n",
      " [    1    54   725  1972   985]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.46      0.25      0.32      2931\n",
      "           1       0.50      0.38      0.43     10824\n",
      "           2       0.67      0.81      0.73     31864\n",
      "           3       0.51      0.48      0.49     13068\n",
      "           4       0.48      0.26      0.34      3737\n",
      "\n",
      "    accuracy                           0.61     62424\n",
      "   macro avg       0.53      0.44      0.47     62424\n",
      "weighted avg       0.59      0.61      0.59     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.606401384083045\n",
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[  913  1229   696    79    14]\n",
      " [  705  4094  5472   527    26]\n",
      " [  190  2111 27063  2324   176]\n",
      " [   33   394  6011  5568  1062]\n",
      " [    3    51   582  1775  1326]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.50      0.31      0.38      2931\n",
      "           1       0.52      0.38      0.44     10824\n",
      "           2       0.68      0.85      0.76     31864\n",
      "           3       0.54      0.43      0.48     13068\n",
      "           4       0.51      0.35      0.42      3737\n",
      "\n",
      "    accuracy                           0.62     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.60      0.62      0.60     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.0482547  -0.50286654  0.2091063  -0.97398092 -1.15145378]\n",
      "=====SCORES=====\n",
      "0.6241830065359477\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 1_CountVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  742  1276   797   105    11]\n",
      " [  614  4126  5397   655    32]\n",
      " [  248  2385 25756  3239   236]\n",
      " [   19   456  5570  6253   770]\n",
      " [    1    53   729  1977   977]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.46      0.25      0.33      2931\n",
      "           1       0.50      0.38      0.43     10824\n",
      "           2       0.67      0.81      0.73     31864\n",
      "           3       0.51      0.48      0.49     13068\n",
      "           4       0.48      0.26      0.34      3737\n",
      "\n",
      "    accuracy                           0.61     62424\n",
      "   macro avg       0.52      0.44      0.47     62424\n",
      "weighted avg       0.59      0.61      0.59     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.606401384083045\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[  918  1221   697    82    13]\n",
      " [  701  4080  5504   514    25]\n",
      " [  195  2106 27081  2310   172]\n",
      " [   34   396  6048  5533  1057]\n",
      " [    3    51   590  1772  1321]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.50      0.31      0.38      2931\n",
      "           1       0.52      0.38      0.44     10824\n",
      "           2       0.68      0.85      0.75     31864\n",
      "           3       0.54      0.42      0.48     13068\n",
      "           4       0.51      0.35      0.42      3737\n",
      "\n",
      "    accuracy                           0.62     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.60      0.62      0.60     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.01718415 -0.5076005   0.22331207 -0.97514731 -1.24718848]\n",
      "=====SCORES=====\n",
      "0.6236864026656415\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 2_CountVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  867  1253   725    69    17]\n",
      " [  786  4440  4943   609    46]\n",
      " [  459  2961 24437  3600   407]\n",
      " [   41   513  5082  6375  1057]\n",
      " [    6    46   602  1911  1172]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.40      0.30      0.34      2931\n",
      "           1       0.48      0.41      0.44     10824\n",
      "           2       0.68      0.77      0.72     31864\n",
      "           3       0.51      0.49      0.50     13068\n",
      "           4       0.43      0.31      0.36      3737\n",
      "\n",
      "    accuracy                           0.60     62424\n",
      "   macro avg       0.50      0.45      0.47     62424\n",
      "weighted avg       0.58      0.60      0.59     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.5973824170190952\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[ 1039  1276   542    63    11]\n",
      " [  864  4555  4911   457    37]\n",
      " [  252  2470 26246  2700   196]\n",
      " [   28   358  5383  6034  1265]\n",
      " [    5    27   452  1794  1459]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.47      0.35      0.41      2931\n",
      "           1       0.52      0.42      0.47     10824\n",
      "           2       0.70      0.82      0.76     31864\n",
      "           3       0.55      0.46      0.50     13068\n",
      "           4       0.49      0.39      0.44      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.49      0.51     62424\n",
      "weighted avg       0.61      0.63      0.62     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.35329509 -0.56433734  0.50417972 -0.98434221 -1.14487822]\n",
      "=====SCORES=====\n",
      "0.6300941945405614\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 3_TfidfVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  107  1144  1613    67     0]\n",
      " [   61  2580  7821   361     1]\n",
      " [   19  1168 28673  1987    17]\n",
      " [    0   147  7942  4883    96]\n",
      " [    0    11  1374  2164   188]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.57      0.04      0.07      2931\n",
      "           1       0.51      0.24      0.33     10824\n",
      "           2       0.60      0.90      0.72     31864\n",
      "           3       0.52      0.37      0.43     13068\n",
      "           4       0.62      0.05      0.09      3737\n",
      "\n",
      "    accuracy                           0.58     62424\n",
      "   macro avg       0.57      0.32      0.33     62424\n",
      "weighted avg       0.57      0.58      0.53     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.5836056644880174\n",
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[  795  1387   624   117     8]\n",
      " [  589  4336  5245   629    25]\n",
      " [  163  2299 26557  2684   161]\n",
      " [   24   408  5604  6220   812]\n",
      " [    2    40   551  2010  1134]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.51      0.27      0.35      2931\n",
      "           1       0.51      0.40      0.45     10824\n",
      "           2       0.69      0.83      0.75     31864\n",
      "           3       0.53      0.48      0.50     13068\n",
      "           4       0.53      0.30      0.39      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.61      0.63      0.61     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.01488208 -0.38030889  0.16542161 -0.97048325 -1.23292618]\n",
      "=====SCORES=====\n",
      "0.6254325259515571\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 4_TfidfVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  107  1144  1613    67     0]\n",
      " [   61  2580  7821   361     1]\n",
      " [   19  1168 28673  1987    17]\n",
      " [    0   147  7942  4883    96]\n",
      " [    0    11  1374  2164   188]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.57      0.04      0.07      2931\n",
      "           1       0.51      0.24      0.33     10824\n",
      "           2       0.60      0.90      0.72     31864\n",
      "           3       0.52      0.37      0.43     13068\n",
      "           4       0.62      0.05      0.09      3737\n",
      "\n",
      "    accuracy                           0.58     62424\n",
      "   macro avg       0.57      0.32      0.33     62424\n",
      "weighted avg       0.57      0.58      0.53     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.5836056644880174\n",
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[  795  1387   624   117     8]\n",
      " [  589  4336  5245   629    25]\n",
      " [  163  2299 26557  2684   161]\n",
      " [   24   408  5604  6220   812]\n",
      " [    2    40   551  2010  1134]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.51      0.27      0.35      2931\n",
      "           1       0.51      0.40      0.45     10824\n",
      "           2       0.69      0.83      0.75     31864\n",
      "           3       0.53      0.48      0.50     13068\n",
      "           4       0.53      0.30      0.39      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.61      0.63      0.61     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.01488249 -0.38032514  0.16541625 -0.97048002 -1.23292607]\n",
      "=====SCORES=====\n",
      "0.6254325259515571\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "Vectorizer Scores for 5_TfidfVectorizer\n",
      "++++++++++++++++++++++++++++++++++++++++\n",
      "*****MNB*****\n",
      "*****CONFUSION MATRIX*****\n",
      "[[  179  1186  1513    52     1]\n",
      " [   77  2868  7598   279     2]\n",
      " [   18  1242 28695  1897    12]\n",
      " [    1   140  7680  5128   119]\n",
      " [    0    18  1326  2127   266]]\n",
      "*****CLASSIFICATION REPORT*****\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.65      0.06      0.11      2931\n",
      "           1       0.53      0.26      0.35     10824\n",
      "           2       0.61      0.90      0.73     31864\n",
      "           3       0.54      0.39      0.45     13068\n",
      "           4       0.67      0.07      0.13      3737\n",
      "\n",
      "    accuracy                           0.59     62424\n",
      "   macro avg       0.60      0.34      0.36     62424\n",
      "weighted avg       0.59      0.59      0.54     62424\n",
      "\n",
      "*****SCORES*****\n",
      "0.5948993976675637\n",
      "=====SVM=====\n",
      "=====CONFUSION MATRIX=====\n",
      "[[  916  1373   565    69     8]\n",
      " [  696  4666  4947   493    22]\n",
      " [  217  2507 26156  2827   157]\n",
      " [   25   364  5343  6334  1002]\n",
      " [    5    32   475  1962  1263]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.49      0.31      0.38      2931\n",
      "           1       0.52      0.43      0.47     10824\n",
      "           2       0.70      0.82      0.75     31864\n",
      "           3       0.54      0.48      0.51     13068\n",
      "           4       0.52      0.34      0.41      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.48      0.51     62424\n",
      "weighted avg       0.61      0.63      0.62     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.17972911 -0.41383963  0.29126027 -0.87403664 -1.04112626]\n",
      "=====SCORES=====\n",
      "0.6301262334999359\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Tab-separated training data: 'Phrase' holds the text, 'Sentiment' the label.\n",
    "train = pd.read_csv(\"kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "X, y = train['Phrase'].values, train['Sentiment'].values\n",
    "\n",
    "# The five class labels are reported under their own digits as names.\n",
    "do_the_thing(X, y, labels=[0,1,2,3,4], target_names=['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Figure size 1000x600 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "\n",
    "# Fixed seed so the random demo data below is the same on every run.\n",
    "np.random.seed(1)\n",
    "\n",
    "data=pd.DataFrame({\"VarX\" : np.arange(10),\n",
    "                   'VarY1': np.random.rand(10),\n",
    "                   'VarY2': np.random.rand(10),\n",
    "                   'VarY3': np.random.rand(10)})\n",
    "\n",
    "fig = plt.figure(figsize=(10,6))\n",
    "# Attach each legend label to its own regplot call. A bare\n",
    "# fig.legend(labels=[...]) pairs labels with artists purely by order, and a\n",
    "# single regplot adds several artists (points, line, band), so the three\n",
    "# labels could end up on pieces of the first series only.\n",
    "sns.regplot(x='VarX', y='VarY1', data=data, label='First')\n",
    "sns.regplot(x='VarX', y='VarY2', data=data, label='Second')\n",
    "sns.regplot(x='VarX', y='VarY3', data=data, label='Third')\n",
    "# With no arguments the legend is built from the labelled artists.\n",
    "fig.legend()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
