# read in the csv file that contains function words and their normalized frequencies
import pandas as pd
from sklearn.cluster import KMeans
papers2 = pd.read_csv('fedPapers85.csv')
author_labels = papers2['author']
filenames = papers2['filename']
papers2_vecs = papers2.drop(['author', 'filename'], axis=1)
print(papers2.shape)
print(papers2_vecs.shape)
print()
H_subset = papers2[papers2['author']=='Hamilton']
M_subset = papers2[papers2['author']=='Madison']
D_subset = papers2[papers2['author']=='dispt']
frames = [H_subset, M_subset, D_subset]
HMD_subset = pd.concat(frames)
HMD_subset = HMD_subset.reset_index(drop=True)
HMD_subset_labels = HMD_subset['author'].values
HMD_subset_filenames = HMD_subset['filename']
HMD_vecs = HMD_subset.drop(['author', 'filename'], axis=1)
print(HMD_subset.shape)
print(HMD_vecs.shape)
# clustering using hamilton and madison essays only
k = 2
km = KMeans(n_clusters=k, init='random', n_init=20, random_state=0)
km.fit(HMD_vecs)
cm = pd.crosstab(HMD_subset_labels, km.labels_, rownames=['author'], colnames=['cluster'])
print(cm)
print()
This clustering did not distinguish Hamilton's writing style from Madison's, and it split the disputed essays roughly evenly between the two clusters.
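To see which specific disputed essays fall into each cluster, the per-essay assignments can be listed; this is an illustrative check (the assignments DataFrame below is not part of the original notebook), assuming the variables defined above.
# Illustrative check: cluster assigned to each disputed essay under km
assignments = pd.DataFrame({'filename': HMD_subset_filenames.values,
                            'author': HMD_subset_labels,
                            'cluster': km.labels_})
print(assignments[assignments['author'] == 'dispt'])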
Now let's apply a z-score transformation to all attribute values and rerun k-means clustering.
from scipy.stats import zscore
HMD_vecs = HMD_vecs.apply(zscore)
k = 2
km2 = KMeans(n_clusters=k, init='random', n_init=5, random_state=0)
km2.fit(HMD_vecs)
cm2 = pd.crosstab(HMD_subset_labels, km2.labels_, rownames=['author'], colnames=['cluster'])
print(cm2)
print()
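As a quick sanity check on the transformation (illustrative, not part of the original analysis): after applying zscore, every function-word column of HMD_vecs should have mean roughly 0 and standard deviation roughly 1. Note that scipy's zscore uses the population standard deviation (ddof=0).
# Sanity check: each z-scored column should have mean ~0 and std ~1
print(HMD_vecs.mean().round(3).head())
print(HMD_vecs.std(ddof=0).round(3).head())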
# clustering using all essays and all features
# transform feature values with zscore transformation
from scipy.stats import zscore
papers2_vecs = papers2_vecs.apply(zscore)
k = 2
km3 = KMeans(n_clusters=k, init='random', n_init=20, random_state=0)
km3.fit(papers2_vecs)
cm3 = pd.crosstab(author_labels, km3.labels_, rownames=['author'], colnames=['cluster'])
print(cm3)
This clustering result makes much better sense: nearly all of Hamilton's essays fall into one cluster, and the disputed essays cluster with Madison's essays.
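Since k-means labels its clusters arbitrarily as 0 and 1, one way to read cm3 is to find the cluster that holds most of each author's essays and check whether the disputed essays share Madison's cluster. The short check below is illustrative and not part of the original notebook.
# Illustrative check: most common km3 cluster for each author group
full = pd.DataFrame({'author': author_labels, 'cluster': km3.labels_})
print(full.groupby('author')['cluster'].agg(lambda s: s.value_counts().idxmax()))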
This study shows that scikit-learn's KMeans algorithm does not normalize the data the way Weka's kMeans does, so users need to normalize the data themselves.
See this Stack Overflow discussion on normalization for k-means in scikit-learn: https://stackoverflow.com/questions/20027645/does-kmeans-normalize-features-automatically-in-sklearn
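A more idiomatic way to express the same normalize-then-cluster step in scikit-learn is to chain StandardScaler and KMeans in a pipeline. The sketch below is equivalent to the manual zscore apply (StandardScaler also uses ddof=0); raw_vecs is just a name introduced here for a fresh copy of the un-transformed feature columns.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Minimal sketch: standardize each feature (mean 0, std 1) before k-means
raw_vecs = papers2.drop(['author', 'filename'], axis=1)
pipe = make_pipeline(StandardScaler(),
                     KMeans(n_clusters=2, init='random', n_init=20, random_state=0))
pipe.fit(raw_vecs)
print(pd.crosstab(author_labels, pipe.named_steps['kmeans'].labels_,
                  rownames=['author'], colnames=['cluster']))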