# Tutorial - vectorize text documents in sklearn

This tutorial demonstrates how to use the Sci-kit Learn (sklearn) package to vectorize text documents.

# Step 1: Read in data

In [1]:
# read in the tsv file as input
import pandas as p
input=p.read_csv("../A-data/moviereview.tsv", delimiter='\t')
docs=input['text'].values
print(docs[0:2])

['\'plot : two teen couples go to a church party , drink and then drive . \\nthey get into an accident . \\none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \\nwhat\\\'s the deal ? \\nwatch the movie and \\" sorta \\" find out . . . \\ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \\nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\\\'t snag this one correctly . \\nthey seem to have taken this pretty neat concept , but executed it terribly . \\nso what are the problems with the movie ? \\nwell , its main problem is that it\\\'s simply too jumbled . \\nit starts off \\" normal \\" but then downshifts into this \\" fantasy \\" world in w

# Step 2: Prepare Vectorizer

In [4]:
# sklearn contains two vectorizers

# CountVectorizer can give you Boolean or TF vectors
# https://urldefense.proofpoint.com/v2/url?u=http-3A__scikit-2Dlearn.org_stable_modules_generated_sklearn.feature-5Fextraction.text.CountVectorizer.html&d=DwIGAg&c=KqtxL2Lt1AKmPhqmvvNjR0MTQm8XwKWV11VtWfYv1LQ&r=2VcUQbUmLN6zU85f94-1ZxUreXcE3be7opAgeVcjVKw&m=2Gqo51Rev6xsuBteQ1AYkPfNXwicgzl0rBQ9uDRqb44&s=P4MNn6qr6Mgi3IcTSJqAWTaHS8zbaomxmgSUBAahYe0&e=

# TfidfVectorizer can give you TF or TFIDF vectors
# https://urldefense.proofpoint.com/v2/url?u=http-3A__scikit-2Dlearn.org_stable_modules_generated_sklearn.feature-5Fextraction.text.TfidfVectorizer.html&d=DwIGAg&c=KqtxL2Lt1AKmPhqmvvNjR0MTQm8XwKWV11VtWfYv1LQ&r=2VcUQbUmLN6zU85f94-1ZxUreXcE3be7opAgeVcjVKw&m=2Gqo51Rev6xsuBteQ1AYkPfNXwicgzl0rBQ9uDRqb44&s=5hK9g_5oHP7rl7T3jCGTh1iX-MFiFfK4BaBYML4_Sdk&e=

# Read the sklearn documentation to understand all vectorization options

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# several commonly used vectorizer setting

#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')


# Step 3: Vectorize the text documents

In [5]:
# The vectorizer can do "fit" and "transform"
# fit is a process to collect unique tokens into the vocabulary
# transform is a process to convert each document to vector based on the vocabulary
# These two processes can be done together using fit_transform(), or used individually: fit() or transform()

# fit vocabulary in documents and transform the documents into vectors
vecs = unigram_count_vectorizer.fit_transform(docs)

# check the content of a document vector
print(vecs.shape)
print(vecs[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_count_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_count_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(unigram_count_vectorizer.vocabulary_.get('year'))

(2000, 13724)
[[0 0 0 ... 0 0 0]]
13724
[('plot', 9380), ('teen', 12334), ('couples', 2622), ('church', 2013), ('party', 9067), ('drink', 3551), ('drive', 3556), ('nthey', 8563), ('accident', 196), ('guys', 5200)]
13670
