# This tutorial demonstrates how to use the scikit-learn (sklearn) package to vectorize text documents.
# Read the tab-separated movie-review file and extract the raw review texts.
# Fix: the original bound the DataFrame to the name `input`, shadowing the
# builtin input(); it also aliased pandas as `p` instead of the conventional `pd`.
import pandas as pd

reviews = pd.read_csv("../A-data/moviereview.tsv", delimiter='\t')
# `docs` is a numpy array of review strings, consumed by the vectorizers below.
docs = reviews['text'].values
print(docs[0:2])
# sklearn contains two vectorizers
# CountVectorizer can give you Boolean or TF vectors
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
# TfidfVectorizer can give you TF or TFIDF vectors
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
# Read the sklearn documentation to understand all vectorization options
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Several commonly used vectorizer configurations.
# All four share the same base settings: latin-1 decoding, English stop-word
# removal, and a minimum document frequency of 5.
_common = dict(encoding='latin-1', min_df=5, stop_words='english')

# unigram Boolean (presence/absence) vectorizer
unigram_bool_vectorizer = CountVectorizer(binary=True, **_common)
# unigram term-frequency vectorizer
unigram_count_vectorizer = CountVectorizer(binary=False, **_common)
# unigram + bigram term-frequency vectorizer
gram12_count_vectorizer = CountVectorizer(ngram_range=(1,2), **_common)
# unigram tf-idf vectorizer
unigram_tfidf_vectorizer = TfidfVectorizer(use_idf=True, **_common)
# A vectorizer supports two operations:
#   fit()       - scan the corpus and collect unique tokens into a vocabulary
#   transform() - convert each document into a vector over that vocabulary
# fit_transform() performs both in a single pass.
vecs = unigram_count_vectorizer.fit_transform(docs)

# Inspect the resulting document-term matrix.
print(vecs.shape)           # (number of documents, vocabulary size)
print(vecs[0].toarray())    # dense view of the first document's count vector

# Inspect the learned vocabulary (a token -> column-index mapping).
vocab = unigram_count_vectorizer.vocabulary_
print(len(vocab))                  # size of the constructed vocabulary
print(list(vocab.items())[:10])    # first 10 (token, index) pairs
print(vocab.get('year'))           # column index assigned to the word 'year'