{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW7: Comparing MNB and SVMs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## INTRODUCTION"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### MNB and SVM\n",
    "How do we take something with 3000 columns and turn it into something meaninful? In short, we, as humans, can't. But computers can!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## ANALYSIS & MODELS\n",
    "### About the Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTING\n",
    "## =======================================================\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "## =======================================================\n",
    "## MACHINE LEARNING\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.svm import LinearSVC\n",
    "\n",
    "\n",
    "unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')\n",
    "unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')\n",
    "bigram_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')\n",
    "unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')\n",
    "bigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')\n",
    "\n",
    "def get_test_train_vec(X,y,vectorizer):\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "    X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, X_test_vec, y_train, y_test\n",
    "\n",
    "def do_the_xy(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):\n",
    "    svm_clf = LinearSVC(C=1)\n",
    "    svm_clf.fit(X_train_vec,y_train)\n",
    "\n",
    "    y_pred = svm_clf.predict(X_test_vec)\n",
    "    cm=confusion_matrix(y_test, y_pred, labels=labels)\n",
    "    print('=====CONFUSION MATRIX=====')\n",
    "    print(cm)\n",
    "\n",
    "    target_names = target_names\n",
    "    print('=====CLASSIFICATION REPORT=====')\n",
    "    print(classification_report(y_test, y_pred, target_names=target_names))\n",
    "\n",
    "    svm_confidence_scores = svm_clf.decision_function(X_test_vec)\n",
    "    print('=====CONFIDENCE SCORES=====')\n",
    "    print(svm_confidence_scores[0])\n",
    "    print('=====SCORES=====')\n",
    "    print(svm_clf.score(X_test_vec,y_test))\n",
    "    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# With Kaggle Sentiment Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train=pd.read_csv(\"kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KAGGLE: Unigram Bool Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[  913  1229   696    79    14]\n",
      " [  705  4094  5472   527    26]\n",
      " [  190  2111 27063  2324   176]\n",
      " [   33   394  6011  5568  1062]\n",
      " [    3    51   582  1775  1326]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.50      0.31      0.38      2931\n",
      "           1       0.52      0.38      0.44     10824\n",
      "           2       0.68      0.85      0.76     31864\n",
      "           3       0.54      0.43      0.48     13068\n",
      "           4       0.51      0.35      0.42      3737\n",
      "\n",
      "    accuracy                           0.62     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.60      0.62      0.60     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.04825473 -0.50286701  0.20910622 -0.97398091 -1.15145374]\n",
      "=====SCORES=====\n",
      "0.6241830065359477\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_bool_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KAGGLE: Unigram Count Vectorizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[  918  1221   697    82    13]\n",
      " [  701  4080  5504   514    25]\n",
      " [  195  2106 27081  2310   172]\n",
      " [   34   396  6048  5533  1057]\n",
      " [    3    51   590  1772  1321]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.50      0.31      0.38      2931\n",
      "           1       0.52      0.38      0.44     10824\n",
      "           2       0.68      0.85      0.75     31864\n",
      "           3       0.54      0.42      0.48     13068\n",
      "           4       0.51      0.35      0.42      3737\n",
      "\n",
      "    accuracy                           0.62     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.60      0.62      0.60     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.01718399 -0.50760034  0.22331216 -0.97514731 -1.24718845]\n",
      "=====SCORES=====\n",
      "0.6236864026656415\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KAGGLE: Bigram Count Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[ 1039  1276   542    63    11]\n",
      " [  864  4555  4911   457    37]\n",
      " [  252  2470 26246  2700   196]\n",
      " [   28   358  5383  6034  1265]\n",
      " [    5    27   452  1793  1460]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.47      0.35      0.41      2931\n",
      "           1       0.52      0.42      0.47     10824\n",
      "           2       0.70      0.82      0.76     31864\n",
      "           3       0.55      0.46      0.50     13068\n",
      "           4       0.49      0.39      0.44      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.49      0.51     62424\n",
      "weighted avg       0.61      0.63      0.62     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.35329329 -0.56433728  0.50420228 -0.98431383 -1.1448782 ]\n",
      "=====SCORES=====\n",
      "0.6301102140202486\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.\n",
      "  \"the number of iterations.\", ConvergenceWarning)\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, bigram_count_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KAGGLE: Unigram TFIDF Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[  795  1387   624   117     8]\n",
      " [  589  4336  5245   629    25]\n",
      " [  163  2299 26557  2684   161]\n",
      " [   24   408  5604  6220   812]\n",
      " [    2    40   551  2010  1134]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.51      0.27      0.35      2931\n",
      "           1       0.51      0.40      0.45     10824\n",
      "           2       0.69      0.83      0.75     31864\n",
      "           3       0.53      0.48      0.50     13068\n",
      "           4       0.53      0.30      0.39      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.46      0.49     62424\n",
      "weighted avg       0.61      0.63      0.61     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.01487712 -0.38031632  0.16541388 -0.97047847 -1.23293474]\n",
      "=====SCORES=====\n",
      "0.6254325259515571\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## KAGGLE: Bigram TFIDF Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[  916  1373   565    69     8]\n",
      " [  696  4666  4947   493    22]\n",
      " [  217  2507 26156  2827   157]\n",
      " [   25   364  5343  6334  1002]\n",
      " [    5    32   475  1962  1263]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.49      0.31      0.38      2931\n",
      "           1       0.52      0.43      0.47     10824\n",
      "           2       0.70      0.82      0.75     31864\n",
      "           3       0.54      0.48      0.51     13068\n",
      "           4       0.52      0.34      0.41      3737\n",
      "\n",
      "    accuracy                           0.63     62424\n",
      "   macro avg       0.55      0.48      0.51     62424\n",
      "weighted avg       0.61      0.63      0.62     62424\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "[-1.17972335 -0.4138446   0.29125406 -0.87403192 -1.04112914]\n",
      "=====SCORES=====\n",
      "0.6301262334999359\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, bigram_tfidf_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# With Joker Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[11 13]\n",
      " [ 4 12]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           P       0.48      0.75      0.59        16\n",
      "           N       0.73      0.46      0.56        24\n",
      "\n",
      "    accuracy                           0.57        40\n",
      "   macro avg       0.61      0.60      0.57        40\n",
      "weighted avg       0.63      0.57      0.57        40\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "0.5249293629884589\n",
      "=====SCORES=====\n",
      "0.575\n"
     ]
    }
   ],
   "source": [
    "neg = get_data_from_files('../NEG_JK_E/')\n",
    "pos = get_data_from_files('../POS_JK_E/')\n",
    "neg_df = pd.DataFrame(neg)\n",
    "pos_df = pd.DataFrame(pos)\n",
    "pos_df['PoN'] = 'P'\n",
    "neg_df['PoN'] = 'N'\n",
    "all_df = neg_df.append(pos_df)\n",
    "y=all_df['PoN'].values\n",
    "X=all_df[0].values"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## JOKER: Unigram Count Vectorizer`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[11 13]\n",
      " [ 4 12]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           P       0.48      0.75      0.59        16\n",
      "           N       0.73      0.46      0.56        24\n",
      "\n",
      "    accuracy                           0.57        40\n",
      "   macro avg       0.61      0.60      0.57        40\n",
      "weighted avg       0.63      0.57      0.57        40\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "0.5249216209994625\n",
      "=====SCORES=====\n",
      "0.575\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## JOKER: Unigram Tfidf Vectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "=====CONFUSION MATRIX=====\n",
      "[[13 11]\n",
      " [ 4 12]]\n",
      "=====CLASSIFICATION REPORT=====\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           P       0.52      0.75      0.62        16\n",
      "           N       0.76      0.54      0.63        24\n",
      "\n",
      "    accuracy                           0.62        40\n",
      "   macro avg       0.64      0.65      0.62        40\n",
      "weighted avg       0.67      0.62      0.63        40\n",
      "\n",
      "=====CONFIDENCE SCORES=====\n",
      "0.056820071563634\n",
      "=====SCORES=====\n",
      "0.625\n"
     ]
    }
   ],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)\n",
    "do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
