import pandas as pd
import numpy as np
# Load the two Mechanical Turk result exports. Both files are expected to
# provide at least the 'WorkerId' and 'Answer.sentiment.label' columns that
# are read below.
pos = pd.read_csv('AMT_pos.csv')
neg = pd.read_csv('AMT_neg.csv')

# Quick look at the first rows and the available columns of `pos`.
print(pos.head())
print(pos.columns)

# np.unique returns a pair: (sorted unique WorkerIds, rows per WorkerId).
unique_turkers_pos = np.unique(pos['WorkerId'], return_counts=True)
print(len(unique_turkers_pos[0]))

# Column 0 holds the WorkerId and column 1 its row count; `df` is the same
# table ordered from the most to the least active worker.
turker_df = pd.DataFrame(zip(unique_turkers_pos[0], unique_turkers_pos[1]))
df = turker_df.sort_values(by=1, ascending=False)

# Same worker tally for the `neg` file.
unique_turkers_neg = np.unique(neg['WorkerId'], return_counts=True)
print(len(unique_turkers_neg[0]))

# Number of groups == number of distinct workers in `pos`.
pos_workers = pos.groupby('WorkerId')
print(len(pos_workers))

# Label distribution of one hard-coded worker in `neg`.
# NOTE(review): the variable name suggests this worker is a suspected bot;
# confirm why this particular id was singled out.
bot = neg[neg['WorkerId'] == 'A3HAEQW13YPT6A']
for_plot = np.unique(bot['Answer.sentiment.label'].tolist(), return_counts=True)
# Materialise the (label, count) pairs: a bare zip object can be consumed
# only once and shows no contents when displayed.
for_plot = list(zip(for_plot[0], for_plot[1]))
print(for_plot)
import seaborn as sns
def make_plot(x, y, col, color, size, data):
    """Draw a faceted seaborn relational plot and return its grid.

    Args:
        x: Name of the column in `data` plotted on the x axis.
        y: Name of the column in `data` plotted on the y axis.
        col: Name of the column whose values split the figure into
            side-by-side facets.
        color: Name of the column mapped to the marker hue.
        size: Name of the column mapped to the marker size.
        data: Table passed to seaborn as `data` (e.g. a pandas DataFrame).

    Returns:
        The object returned by `sns.relplot` (a seaborn FacetGrid), so the
        caller can further customise or save the figure.
    """
    # Apply seaborn's default theme before drawing.
    sns.set()
    return sns.relplot(x=x, y=y, col=col, hue=color, size=size, data=data)
# Try make_plot on seaborn's bundled "tips" example dataset.
# NOTE(review): load_dataset fetches the data from seaborn's online
# example-data repository when it is not cached locally, so this needs
# network access on first run.
data = sns.load_dataset("tips")
make_plot('total_bill', 'tip', 'time', 'smoker', 'size', data)
print(data.head())

# Work time per sentiment label in the `pos` file, as a box plot.
# NOTE(review): boxplot is axes-level and draws on the current Matplotlib
# axes; when this runs as one script (not as separate notebook cells) that
# may be the last facet created by make_plot above -- confirm.
sns.boxplot(x="Answer.sentiment.label", y="WorkTimeInSeconds", data=pos)

# Bar plots of WorkTimeInSeconds per sentiment label (seaborn's default
# estimator for kind="bar" is the mean), one figure per input file.
sns.catplot(x="Answer.sentiment.label", y="WorkTimeInSeconds", kind="bar", data=pos)
sns.catplot(x="Answer.sentiment.label", y="WorkTimeInSeconds", kind="bar", data=neg)

# Rows per worker in `pos`: column 0 is the WorkerId, column 1 its count.
unique_turkers_pos = np.unique(pos['WorkerId'], return_counts=True)
print(len(unique_turkers_pos[0]))
turker_df = pd.DataFrame(zip(unique_turkers_pos[0], unique_turkers_pos[1]))

# Most active workers first, then one bar per worker.
df = turker_df.sort_values(by=1, ascending=False)
df.plot(kind='bar', x=0, y=1)
from sklearn.metrics import cohen_kappa_score
# Toy inter-rater agreement check on two hard-coded label sequences of 15
# items each, using labels 0-4.
y1 = list(range(5)) * 3
y2 = [
    0, 1, 2, 2, 4,
    1, 2, 3, 0, 0,
    0, 2, 2, 4, 4,
]
# Cohen's kappa between the two sequences (value is not stored).
cohen_kappa_score(y1, y2)
def _count_label_per_worker(frame, label):
    """Count, per WorkerId, the rows of `frame` whose sentiment label is `label`.

    Args:
        frame: DataFrame with 'WorkerId' and 'Answer.sentiment.label' columns.
        label: Sentiment label value to count (compared with ==).

    Returns:
        DataFrame with one row per WorkerId and two columns: 'WorkerId' and
        a column named after `label` holding the integer count.
    """
    # Cast the boolean match mask to int so the grouped sum is an integer
    # count regardless of how the pandas version sums booleans.
    matches = frame['Answer.sentiment.label'].eq(label).astype(int)
    return matches.groupby(frame['WorkerId']).sum().reset_index(name=label)


# One table per sentiment label, each giving the count for every worker
# that appears in `pos`.
df1 = _count_label_per_worker(pos, 'Neutral')
df2 = _count_label_per_worker(pos, 'Positive')
df3 = _count_label_per_worker(pos, 'Negative')
# TODO: combine the three tables into one per-worker summary, e.g.
# df1.merge(df2, on='WorkerId').merge(df3, on='WorkerId').
import pandas as pd
import numpy as np
# Toy example: spread each turker's sentiment labels across numbered
# columns (SENTIMENT1, SENTIMENT2, ...) next to the sum of their REVIEW
# values.
np.random.seed(2016)
df = pd.DataFrame(
    {
        'Turker': ['A', 'C', 'B', 'A', 'C', 'C'],
        'SENTIMENT': ['Neg', 'Pos', 'Neg', 'Neg', 'Pos', 'Neu'],
        'REVIEW': [1, 2, 3, 4, 5, 6],
    }
)
grouped = df.groupby('Turker')

# Sum of REVIEW per turker.
values = grouped['REVIEW'].sum()

# Re-index each turker's labels from 0 so that unstack() lines them up in
# columns 0, 1, 2, ...; turkers with fewer labels get missing values.
id_df = grouped['SENTIMENT'].apply(lambda labels: pd.Series(labels.values)).unstack()
# Positional column labels 0, 1, ... become SENTIMENT1, SENTIMENT2, ...
id_df = id_df.rename(columns=lambda pos_idx: 'SENTIMENT{}'.format(pos_idx + 1))

result = pd.concat([id_df, values], axis=1)
print(result.T)
df