How do we take something with 3000 columns and turn it into something meaningful? In short, we, as humans, can't. But computers can!
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every regular file directly inside ``path`` and return the contents.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per file, ordered by file name so that
        repeated runs see the documents in the same order.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    results = []
    # os.listdir() returns names in arbitrary order; sorting keeps the corpus
    # order (and any seeded train/test split built from it) reproducible.
    for name in sorted(os.listdir(path)):
        # os.path.join works whether or not ``path`` ends with a separator.
        full_path = os.path.join(path, name)
        # Subdirectories cannot be opened as text; skip them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as f:
            results.append(f.read())
    return results
## =======================================================
## MACHINE LEARNING
## =======================================================
def do_the_xy(x, y, labels, target_names):
    """Train a linear SVM on unigram counts and print its evaluation.

    The data is split 60/40 into train/test with a fixed seed, the text is
    vectorized into unigram term counts, a ``LinearSVC`` is fitted on the
    training part, and the results on the test part are printed.

    Args:
        x: Documents (raw text). Must be indexable and expose ``.shape``;
            the callers in this file pass NumPy arrays.
        y: Class label for each document, aligned with ``x``.
        labels: Label values, in the order the confusion-matrix and report
            rows should appear.
        target_names: Display names for the report, in the same order as
            ``labels``.

    Returns:
        None. Everything is reported through ``print``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC

    # Split the *argument* ``x``. Reading a module-level ``X`` here would
    # train on whatever the caller's global happens to hold.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.4, random_state=0
    )
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    print(X_train[0])
    print(y_train[0])
    print(X_test[0])
    print(y_test[0])

    # Unigram term-frequency vectorizer with minimum document frequency 5.
    # Other common settings: binary=True (boolean), ngram_range=(1, 2)
    # (unigrams + bigrams), or TfidfVectorizer(use_idf=True).
    unigram_count_vectorizer = CountVectorizer(
        encoding='latin-1', binary=False, min_df=5, stop_words='english'
    )
    # Fit the vocabulary on the training data only, then reuse it for test.
    X_train_vec = unigram_count_vectorizer.fit_transform(X_train)
    X_test_vec = unigram_count_vectorizer.transform(X_test)

    # Initialize the LinearSVC model and train it on the training data.
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)

    y_pred = svm_clf.predict(X_test_vec)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print(cm)
    print()

    # Pass ``labels`` so each name in ``target_names`` is attached to the
    # matching label; without it the report uses sorted label order and
    # e.g. ['P', 'N'] would be printed against the wrong rows.
    print(classification_report(
        y_test, y_pred, labels=labels, target_names=target_names
    ))

    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    # Confidence score(s) for the first test example.
    print(svm_confidence_scores[0])
    # Mean accuracy on the test split.
    print(svm_clf.score(X_test_vec, y_test))
import pandas as pd

# Kaggle sentiment data: tab-separated, one phrase per row, with an integer
# sentiment label in the 'Sentiment' column.
train = pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')

# Features are the raw phrases; targets are the sentiment labels.
X = train['Phrase'].values
y = train['Sentiment'].values

# Five classes, 0 through 4, displayed under their own digit as the name.
do_the_xy(X, y, list(range(5)), [str(label) for label in range(5)])
import pandas as pd
import numpy as np

# Load the raw documents first: the data frames below are built from them,
# so they must exist before pd.DataFrame() is called.
neg = get_data_from_files('../NEG_JK_E/')
pos = get_data_from_files('../POS_JK_E/')

# One row per document; the text ends up in column 0.
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

# Tag each document with its class: 'P' for positive, 'N' for negative.
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'

# DataFrame.append() was removed in pandas 2.0; pd.concat() stacks the rows
# the same way (negatives first, original indices kept).
all_df = pd.concat([neg_df, pos_df])

y = all_df['PoN'].values
X = all_df[0].values
do_the_xy(X, y, ['P', 'N'], ['P', 'N'])