import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Load the tab-separated training set and pull out the raw phrases (features)
# and their sentiment labels (targets) as plain arrays.
train = pd.read_csv("kaggle-sentiment/train.tsv", sep='\t')
X = train['Phrase'].values
y = train['Sentiment'].values

# Shared bag-of-words vectorizer; it is (re)fitted by the split helpers below
# each time they are called.
vectorizer = CountVectorizer(
    encoding='latin-1',
    binary=False,
    min_df=5,
    stop_words='english',
)
def get_test_train_vec(X, y, vectorizer, *, test_size=0.4, random_state=0,
                       stratify=None):
    """Split the data into train/test partitions and vectorize both.

    The vectorizer is fitted on the training partition only and then merely
    applied to the test partition, so nothing learned from the held-out
    samples influences the fitted vocabulary.

    Args:
        X: Raw samples to split, in whatever form ``vectorizer`` accepts.
        y: Labels aligned with ``X``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.
            It is fitted in place, replacing any previous fit.
        test_size: Forwarded to ``train_test_split``; defaults to the
            original hard-coded ``0.4``.
        random_state: Forwarded to ``train_test_split``; defaults to the
            original hard-coded ``0`` so splits stay reproducible.
        stratify: Forwarded to ``train_test_split``. Pass ``y`` to keep the
            class proportions equal in both partitions; ``None`` (default)
            reproduces the original unstratified split.

    Returns:
        Tuple ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    # Fit on the training partition only; the test partition is transformed
    # with the vocabulary learned from the training data.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test
import pandas as pd

# Vectorize using the default split, then inspect how many training rows fall
# into each label.
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X, y, vectorizer)

# The labels are used as the frame's index; reset_index() then exposes them
# as a regular column named 'index' so they can be counted.
# NOTE(review): this rebinds `train`, replacing the frame loaded from the TSV.
train = pd.DataFrame(X_train_vec, index=y_train).reset_index()
train['index'].value_counts()
def get_test_train_vec_2(X, y, vectorizer):
    """Draw a small stratified sample and vectorize it.

    Takes 100 training rows and 50 test rows (fixed seed 0), stratified on
    ``y``, fits ``vectorizer`` on the training rows and applies it to the
    test rows.

    Returns:
        Tuple ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    # NOTE(review): with so few training rows, a vectorizer configured with a
    # high min_df may keep very few terms -- confirm this is intended.
    split = train_test_split(
        X, y, train_size=100, test_size=50, random_state=0, stratify=y
    )
    raw_train, raw_test, labels_train, labels_test = split
    train_matrix = vectorizer.fit_transform(raw_train)
    test_matrix = vectorizer.transform(raw_test)
    return train_matrix, test_matrix, labels_train, labels_test
# Repeat the label-count inspection on the small stratified sample.
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec_2(X, y, vectorizer)

# Labels become the index, then reset_index() turns them into the 'index'
# column that is counted below.
train = pd.DataFrame(X_train_vec, index=y_train).reset_index()
train['index'].value_counts()
from sklearn.utils.class_weight import compute_class_weight

# Go back to the default split and look at the label distribution again.
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X, y, vectorizer)
train = pd.DataFrame(X_train_vec, y_train)
train.reset_index(inplace=True)
train['index'].value_counts()
train['index'].unique()

# `classes` and `y` are passed by keyword: recent scikit-learn releases accept
# them only as keyword arguments, and older releases accept the keyword form
# as well, so this call works on both.
compute_class_weight('balanced', classes=train['index'].unique(), y=train['index'])
# Same call with class_weight=None, for comparison with the balanced weights.
compute_class_weight(None, classes=train['index'].unique(), y=train['index'])
# Undersampling plan (only the class-size lookup below is written so far):
#   1. shuffle the rows
#   2. find the size of the smallest class, e.g. with a
#      get_smallest_num(count_of_groups) helper or simply count_of_groups.min
#   3. take the first N rows of each class, where N is that smallest size
#   4. run the train/test experiment again on the balanced sample
# Size of the smallest class among the current training labels.
train['index'].value_counts(sort=False).min()