This tutorial demonstrates how to use the scikit-learn (sklearn) package to vectorize text documents.
# read in the tsv file as input
import pandas as p
# Load the tab-separated review file into a pandas DataFrame.
# NOTE(review): the name `input` shadows Python's builtin input(); it is kept
# unchanged here in case code elsewhere in the file refers to it.
input = p.read_csv("../A-data/moviereview.tsv", sep='\t')
# sklearn provides several text vectorizers; this tutorial uses two of them
# CountVectorizer can give you Boolean or TF vectors
# TfidfVectorizer can give you TF or TFIDF vectors
# Read the sklearn documentation to understand all vectorization options
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Several commonly used vectorizer settings. Every vectorizer below keeps only
# terms found in at least 5 documents (min_df=5) and uses the built-in English
# stop-word list.

# Unigram boolean vectorizer: binary=True records term presence, not counts.
unigram_bool_vectorizer = CountVectorizer(
    encoding='latin-1',
    stop_words='english',
    min_df=5,
    binary=True,
)

# Unigram term-frequency vectorizer: binary=False keeps the raw term counts.
unigram_count_vectorizer = CountVectorizer(
    encoding='latin-1',
    stop_words='english',
    min_df=5,
    binary=False,
)

# Unigram + bigram term-frequency vectorizer: ngram_range=(1, 2).
gram12_count_vectorizer = CountVectorizer(
    encoding='latin-1',
    stop_words='english',
    min_df=5,
    ngram_range=(1, 2),
)

# Unigram TF-IDF vectorizer: use_idf=True turns on inverse-document-frequency weighting.
unigram_tfidf_vectorizer = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    min_df=5,
    use_idf=True,
)
# A vectorizer exposes two steps, "fit" and "transform":
#   - fit collects the unique tokens of the documents into the vocabulary
#   - transform converts each document into a vector based on that vocabulary
# The two steps can be run together with fit_transform(), or separately with fit() and transform().
# NOTE(review): `docs` is not defined anywhere in the visible part of this file, so the
# line below raises NameError as written. Presumably it should hold the review-text
# column of the DataFrame loaded above -- TODO confirm the column name and define `docs` first.
# Fit the vocabulary on the documents and transform them into term-count vectors.
vecs = unigram_count_vectorizer.fit_transform(docs)
# NOTE(review): the four checks listed below have no accompanying code in the visible part of this file.
# check the content of a document vector
# check the size of the constructed vocabulary
# print out the first 10 items in the vocabulary
# check word index in vocabulary