{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train=pd.read_csv(\"kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')\n",
    "\n",
    "def get_test_train_vec(X,y,vectorizer):\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)\n",
    "#     X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.4, random_state=0, stratify=y)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "    X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, X_test_vec, y_train, y_test\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train = pd.DataFrame(X_train_vec, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.reset_index(inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    47718\n",
       "3    19859\n",
       "1    16449\n",
       "4     5469\n",
       "0     4141\n",
       "Name: index, dtype: int64"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train['index'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    51\n",
       "3    21\n",
       "1    17\n",
       "4     6\n",
       "0     5\n",
       "Name: index, dtype: int64"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_test_train_vec_2(X,y,vectorizer):\n",
    "#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=100, test_size=50, random_state=0, stratify=y)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "    X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, X_test_vec, y_train, y_test\n",
    "\n",
    "\n",
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec_2(X,y,vectorizer)\n",
    "train = pd.DataFrame(X_train_vec, y_train)\n",
    "train.reset_index(inplace=True)\n",
    "train['index'].value_counts()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([3, 2, 0, 4, 1])"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vectorizer)\n",
    "train = pd.DataFrame(X_train_vec, y_train)\n",
    "train.reset_index(inplace=True)\n",
    "train['index'].value_counts()\n",
    "train['index'].unique()\n",
    "\n",
    "# compute_class_weight('balanced', y.unique(), y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([1., 1., 1., 1., 1.])"
      ]
     },
     "execution_count": 47,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "compute_class_weight('balanced', train['index'].unique(), train['index'])\n",
    "compute_class_weight(None, train['index'].unique(), train['index'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4141"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# shuffle\n",
    "# get_smallest_num(count_of_groups)\n",
    "# count_of_groups.min\n",
    "# take first N (where N is num of smallest group) of each group\n",
    "# run train test\n",
    "\n",
    "train['index'].value_counts().min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
