import pandas as pd
import numpy as np
neg = pd.read_csv('AMT_neg.csv')
pos = pd.read_csv('AMT_pos.csv')
neg[:3]
pos[:3]
neg.columns.tolist()
def get_unique(df, column):
    # unique values in `column` plus how often each one appears
    values, counts = np.unique(df[column], return_counts=True)
    summary = pd.DataFrame({column: values, 'count': counts})
    return len(values), (values, counts), summary
num_neg, unique_neg, u_neg_df = get_unique(neg, 'WorkerId')
num_pos, unique_pos, u_pos_df = get_unique(pos, 'WorkerId')
print(num_neg, 'Turkers worked on NEG batch')
print(num_pos, 'Turkers worked on POS batch')
u_neg_df.plot(kind='bar', x='WorkerId', y='count')
u_pos_df.plot(kind='bar', x='WorkerId', y='count')
Max and min HITs for unique Turkers
print('For {}, the min was: {} and the max was: {}'.format('neg', unique_neg[1].min(), unique_neg[1].max()))
print('For {}, the min was: {} and the max was: {}'.format('pos', unique_pos[1].min(), unique_pos[1].max()))
import seaborn as sns
import matplotlib.pyplot as plt
sns.catplot(x="Answer.sentiment.label",
y="WorkTimeInSeconds",
kind="bar",
order=['Negative', 'Neutral', 'Positive'],
data=neg);
plt.title('Negative')
sns.catplot(x="Answer.sentiment.label",
y="WorkTimeInSeconds",
kind="bar",
order=['Negative', 'Neutral', 'Positive'],
data=pos)
plt.title('Positive')
# HITs completed suspiciously fast (under 10 seconds) vs. everything else
response_time = neg[neg['WorkTimeInSeconds'] < 10]
response_time_check = neg[neg['WorkTimeInSeconds'] >= 10]
len(response_time)
len(response_time_check)
# HITs per worker and mean response time per worker, side by side
count = pos.groupby(['WorkerId'])['HITId'].count()
work_time = pos.groupby(['WorkerId'])['WorkTimeInSeconds'].mean()
new_df = pd.DataFrame([work_time, count]).T
new_df[:5]
new_df['WorkTimeInMin'] = new_df['WorkTimeInSeconds']/60
new_df[:5]
count = pos.groupby(['WorkerId', 'Answer.sentiment.label'])['Answer.sentiment.label'].count()
# count = pos.groupby(['WorkerId'])['Answer.sentiment.label'].count()
count
pnn = pd.DataFrame()
pnn['Neutral'] = pos.groupby('WorkerId')['Answer.sentiment.label'].apply(lambda x: (x=='Neutral').sum())
pnn['Positive'] = pos.groupby('WorkerId')['Answer.sentiment.label'].apply(lambda x: (x=='Positive').sum())
pnn['Negative'] = pos.groupby('WorkerId')['Answer.sentiment.label'].apply(lambda x: (x=='Negative').sum())
pnn['Total'] = pos.groupby('WorkerId')['Answer.sentiment.label'].apply(lambda x: x.count())
pnn[:5]
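As an aside, pd.crosstab can build the same worker-by-label table in one shot. A sketch, assuming the same pos frame as above:
# Sketch: one-line equivalent of the three .apply() calls
pnn_alt = pd.crosstab(pos['WorkerId'], pos['Answer.sentiment.label'])
pnn_alt['Total'] = pnn_alt.sum(axis=1)
pnn_alt[:5]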
top = pnn.sort_values(by=['Total'], ascending=False)
top[:10]
Interesting!! Looking at this, we have three workers who ONLY chose positive.
Let's look at their response times to see if we can determine whether they are bots!!
top['Avg_WorkTimeInSeconds'] = pos.groupby('WorkerId')['WorkTimeInSeconds'].mean()
top['Avg_WorkTimeInMin'] = pos.groupby('WorkerId')['WorkTimeInSeconds'].mean() / 60
top['Min_WorkTimeInMin'] = pos.groupby('WorkerId')['WorkTimeInSeconds'].min() / 60
top['Max_WorkTimeInMin'] = pos.groupby('WorkerId')['WorkTimeInSeconds'].max() / 60
top[:10]
Even more interesting! These two don't appear to be bots, based on our current metric, time variability.
HOWEVER, worker A681XM15AN28F appears to spend an average of only 13 seconds per review, which doesn't seem like enough time to read and judge one...
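Since the metric we're leaning on is time variability, it's worth computing it explicitly rather than eyeballing min/max. A sketch, assuming the same top and pos frames:
# Sketch: per-worker spread in response time; a near-zero std would look bot-like
top['Std_WorkTimeInSeconds'] = pos.groupby('WorkerId')['WorkTimeInSeconds'].std()
top[:10]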
TOO MANY REVIEWERS!
Here is when we realized that computing a kappa score with over 30 individual reviewers would be tricky, so we resubmitted the batch to AMT and required the Turkers to be 'Master', in the hopes that this additional barrier to entry would reduce the number of Turkers working on the project.
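For reference, once every review has exactly three ratings, Fleiss' kappa handles the many-rater case directly. A sketch, assuming a hypothetical complete results frame called batch:
# Sketch: Fleiss' kappa across many raters (batch is hypothetical and needs exactly 3 ratings per review)
from statsmodels.stats import inter_rater
rating_counts = pd.crosstab(batch['Input.text'], batch['Answer.sentiment.label'])
print(inter_rater.fleiss_kappa(rating_counts.values))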
v2 = pd.read_csv('HW5_amt_v2.csv')
v2[:5]
len(v2)
This time, I didn't separate the df into pos and neg before submitting to AMT, so we have to reimport the labels.
labels = pd.read_csv('all_JK_extremes_labeled.csv')
len(labels)
Oops! That's right, we replicated each review ×3 so that three separate people could look at each one.
labels2 = pd.concat([labels] * 3, ignore_index=True)
len(labels2)
labels2.sort_values(by='0')
Shoot! I realized I had to delete some emojis for the csv to be accepted by AMT, so the reviews themselves won't actually match... Solution: create two 'for_matching' columns made up of the first 5 words of each review.
v2['for_matching'] = v2.apply(lambda x: x['Input.text'].split()[:5], axis=1)
labels2['for_matching'] = labels2.apply(lambda x: x['0'].split()[:5], axis=1)
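(If we had kept going with this, the prefix would need to be a string rather than a list, since lists can't serve as merge keys. A sketch of that direction, with matched as a hypothetical result frame:)
# Sketch: string prefixes as a merge key
v2['for_matching'] = v2['Input.text'].apply(lambda t: ' '.join(t.split()[:5]))
labels2['for_matching'] = labels2['0'].apply(lambda t: ' '.join(t.split()[:5]))
matched = v2.merge(labels2[['for_matching', 'PoN']].drop_duplicates(), on='for_matching', how='left')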
Annnnnd why did I do that when I could just sort the df and apply the PoN labels?
sorted_labels = labels2.sort_values(by='0')
sorted_labels[:6]
sorted_v2 = v2.sort_values(by='Input.text')
sorted_v2[sorted_v2.columns[-5:]][:6]
all_df = sorted_v2.copy()
# all_df['PoN'] = sorted_labels['PoN'].tolist()
# THIS DIDN'T WORK BECAUSE I DIDN'T WAIT UNTIL ALL WERE DONE FROM AMT. RESEARCHER ERROR BUT OMG I HATE MYSELF
len(all_df)
293/3
Confirming that YEP. 293 isn't divisible by 3, meaning I didn't wait until the last turker finished. omg.
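A one-line guard would have caught this before any label matching. A sketch:
# Sketch: fail fast if the batch is incomplete (we need exactly 3 ratings per review)
assert len(all_df) % 3 == 0, f'incomplete batch: {len(all_df)} rows is not divisible by 3'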
turker = pd.read_csv('HW5_amt_294.csv')
print(len(turker))
turker[turker.columns[-5:]][:5]
# Getting labels...
labels = pd.read_csv('all_JK_extremes_labeled.csv')
# X3
labels = pd.concat([labels] * 3, ignore_index=True)
print(len(labels))
labels[:5]
sorted_labels = labels.sort_values(by=['0'])
sorted_turker = turker.sort_values(by=['Input.text'])
sorted_labels[:5]
sorted_turker['Input.text'][:5]
OMG HOORAY HOORAY HOORAY!!
NOTE: FUN FACT!! I can type here and then hit the Esc key (then M) to turn this cell into markdown!!
# YUCK, THIS IS SO AGGRAVATING!! The line below doesn't work because assignment aligns on the index,
# so the P and N labels didn't match up.
# sorted_turker['PoN'] = sorted_labels['PoN']
sorted_turker['PoN'] = sorted_labels['PoN'].tolist()
sorted_turker[sorted_turker.columns[-5:]][:5]
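An equivalent fix is to reset both indexes after sorting, so the assignment aligns positionally. A sketch:
# Sketch: reset_index instead of .tolist() for positional alignment
sorted_turker = sorted_turker.reset_index(drop=True)
sorted_turker['PoN'] = sorted_labels['PoN'].reset_index(drop=True)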
First, let's clean ALL the things
all_df = sorted_turker[['Input.text', 'WorkerId', 'Answer.sentiment.label', 'PoN']]
all_df[:5]
all_df_all = all_df.copy()
# NOTE: taking the first character maps 'Positive' -> 'P' and 'Negative' -> 'N',
# but 'Neutral' also becomes 'N', so Neutral answers get lumped in with Negative here.
all_df_all['APoN'] = all_df_all.apply(lambda x: x['Answer.sentiment.label'][0], axis=1)
all_df_all
all_df_all['agree'] = all_df_all.apply(lambda x: x['PoN'] == x['APoN'], axis=1)
all_df_all[-10:]
gdf = pd.DataFrame(all_df_all.groupby(['Input.text','PoN'])['agree'].mean())
gdf_forplot = gdf.copy()
gdf
OK so this actually gave us something we want... BUT PLEASE TELL ME THE BETTER WAY!!
three_agreed = gdf[gdf['agree'] == 1]
len(three_agreed)
three_agreed_but_wrong = gdf[gdf['agree'] == 0]
len(three_agreed_but_wrong)
disparity = gdf[(gdf['agree'] > 0) & (gdf['agree'] < 1)]
len(disparity)
# counts hard-coded from the three len() calls above
quickdf = pd.DataFrame({'labels': ['agreed', 'agreed, incorrect', 'disparity'], 'counts': [33, 31, 34]})
quickdf
quickdf.plot(kind='bar', x='labels', y='counts')
Lol that is not super useful
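The hard-coded counts can be avoided entirely by bucketing the agreement scores. A sketch, assuming gdf as built above:
# Sketch: derive the three buckets instead of typing the counts in by hand
agreement = pd.cut(gdf['agree'], bins=[-0.1, 0.0, 0.99, 1.0],
                   labels=['agreed, incorrect', 'disparity', 'agreed'])
agreement.value_counts().plot(kind='bar')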
# Abandoned first pass at splitting the counts by polarity; kept for reference.
# Note: one reset_index() (without drop=True) is all that's needed to turn PoN back into a column.
# gdf = gdf.reset_index()
# p_three_agreed = gdf[(gdf['agree'] == 1) & (gdf['PoN'] == 'P')]
# n_three_agreed = gdf[(gdf['agree'] == 1) & (gdf['PoN'] == 'N')]
# p_three_agreed_wrong = gdf[(gdf['agree'] == 0) & (gdf['PoN'] == 'P')]
# n_three_agreed_wrong = gdf[(gdf['agree'] == 0) & (gdf['PoN'] == 'N')]
# p_disparity = gdf[(gdf['agree'] > 0) & (gdf['agree'] < 1) & (gdf['PoN'] == 'P')]
# n_disparity = gdf[(gdf['agree'] > 0) & (gdf['agree'] < 1) & (gdf['PoN'] == 'N')]
# quickdf = pd.DataFrame({'labels': ['positive', 'negative'], 'counts': [18, 15]})
# quickdf.plot(kind='bar', x='labels', y='counts')
# Abandoned plot attempt (gdf_forplot doesn't have these columns):
# sns.catplot(x="Answer.sentiment.label",
#             y="WorkTimeInSeconds",
#             kind="bar",
#             order=['Negative', 'Neutral', 'Positive'],
#             data=gdf_forplot);
# plt.title('By Polarity')
gdf_forplot = gdf_forplot.reset_index()
gdf_forplot.groupby(['agree'])['agree'].count()
gdf_forplot.groupby(['agree','PoN'])['agree'].count()
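And to get the by-polarity view the commented-out catplot was aiming for, the grouped counts can be unstacked and plotted directly. A sketch:
# Sketch: agreement-level counts split by true polarity
gdf_forplot.groupby(['agree', 'PoN']).size().unstack('PoN').plot(kind='bar')
plt.title('Agreement by polarity')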