Tutorial: How to Calculate Cosine Similarity

Method 1: Use numpy's dot and norm functions

In [1]:
from numpy import dot
from numpy.linalg import norm

# Two small example vectors.
X = [1,2]
Y = [2,2]

# cosine similarity = (X . Y) / (|X| * |Y|)
inner_product = dot(X, Y)
magnitude_product = norm(X) * norm(Y)
cos_sim = inner_product / magnitude_product
print(cos_sim)
0.9486832980505138

Method 2: Use scipy's built-in cosine function

In [6]:
# scipy's `cosine` returns the cosine *distance*, which is defined as
# 1 - cosine similarity. Subtracting that distance from 1 therefore
# recovers the cosine similarity itself.

from scipy import spatial

X = [1,2]
Y = [2,2]
cos_dist = spatial.distance.cosine(X, Y)
cos_sim = 1 - cos_dist
print(cos_sim)
0.948683298051

Method 3: Use sklearn to calculate the cosine similarity matrix among vectors

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Three example vectors; Z is a scalar multiple of X.
X = np.array([1,2])
Y = np.array([2,2])
Z = np.array([2,4])

# Compare the single vector X against Y and Z in one call.
# Each argument is a list of vectors (one row per vector), so the result
# holds both cosine_sim(X,Y) and cosine_sim(X,Z) in a single row.
query_rows = [X]
candidate_rows = [Y, Z]
cos_sim = cosine_similarity(query_rows, candidate_rows)
print(cos_sim)

# With a single argument every vector is compared against every other one,
# giving the entire cosine similarity matrix among X, Y, and Z.
all_rows = [X, Y, Z]
cos_sim = cosine_similarity(all_rows)
print(cos_sim)
print()
[[ 0.9486833  1.       ]]
[[ 1.         0.9486833  1.       ]
 [ 0.9486833  1.         0.9486833]
 [ 1.         0.9486833  1.       ]]

Use cosine similarity for plagiarism detection

In [8]:
# Everything this cell needs is imported here so it does not rely on names
# left behind by earlier cells (cosine_similarity is the function actually used).
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel

# data from https://www.bowdoin.edu/studentaffairs/academic-honesty/examples/mosaic/index.shtml

txt_original = "Contrast the condition into which all these friendly Indians are suddenly plunged now, with their condition only two years previous: martial law now in force on all their reservations; themselves in danger of starvation, and constantly exposed to the influence of emissaries from their friends and relations, urging them to join in fighting this treacherous government that had kept faith with nobody--neither with friend nor with foe."
txt_plagiarized = "Only two years later, all these friendly Sioux were suddenly plunged into new conditions, including starvation, martial law on all their reservations, and constant urging by their friends and relations to join in warfare against the treacherous government that had kept faith with neither friend nor foe."
txt_control = "Only two years later, all the money he won from lottery was gone."

txts = [txt_original, txt_plagiarized, txt_control]

# Unigram bag-of-words vectorizers that keep raw term counts (binary=False).
# The stop-word-removing variant is defined for experimentation but is not
# used further down in this cell.
unigram_count = CountVectorizer(encoding='latin-1', binary=False)
unigram_count_stop_remove = CountVectorizer(encoding='latin-1', binary=False, stop_words='english')

# Document-term matrix: one row per text in `txts`.
vecs = unigram_count.fit_transform(txts)

print(vecs.shape)

# Similarity of the original text (row 0) to every document, itself included.
cos_sim = cosine_similarity(vecs[0], vecs)
print(cos_sim)

# argsort orders indices by ascending similarity, so the most similar
# documents sit at the END of the (single) row.
sim_sorted_doc_idx = cos_sim.argsort()
print(sim_sorted_doc_idx.shape)

# print the most similar doc; it's actually the original doc itself
print(txts[sim_sorted_doc_idx[0][-1]])
print()

# print the second most similar doc; it's the most likely plagiarized one
print(txts[sim_sorted_doc_idx[0][-2]])
(3, 70)
[[ 1.          0.69376514  0.21550898]]
(1, 3)
Contrast the condition into which all these friendly Indians are suddenly plunged now, with their condition only two years previous: martial law now in force on all their reservations; themselves in danger of starvation, and constantly exposed to the influence of emissaries from their friends and relations, urging them to join in fighting this treacherous government that had kept faith with nobody--neither with friend nor with foe.

Only two years later, all these friendly Sioux were suddenly plunged into new conditions, including starvation, martial law on all their reservations, and constant urging by their friends and relations to join in warfare against the treacherous government that had kept faith with neither friend nor foe.
In [ ]: