In [116]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

## =======================================================
## TOKENIZING
## =======================================================
from nltk.tokenize import word_tokenize, sent_tokenize
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

def get_sentence_tokens(review):
    return sent_tokenize(review)

## =======================================================
## REMOVING STOPWORDS
## =======================================================
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text

## =======================================================
## FREQUENCY DISTRIBUTIONS
## =======================================================
from nltk.probability import FreqDist
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)

def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)

def get_fdist(tokens):
    return (FreqDist(tokens))

## =======================================================
## SENTIMENT ANALYSIS
## =======================================================
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def get_vader_score(review):
    return sid.polarity_scores(review)

def separate_vader_score(vader_score, key):
    return vader_score[key]

## =======================================================
## SUMMARIZER
## =======================================================
def get_weighted_freq_dist(review, freq_dist):
    max_freq = max(freq_dist.values())
    for word in freq_dist.keys():
        freq_dist[word] = (freq_dist[word]/max_freq)
    return freq_dist

def get_sentence_score(review, freq_dist):
    sentence_scores = {}
    for sent in review:
        for word in nltk.word_tokenize(sent.lower()):
            if word in freq_dist.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = freq_dist[word]
                    else:
                        sentence_scores[sent] += freq_dist[word]
    return sentence_scores

def get_summary_sentences(sentence_scores):
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    return ''.join(sent[0] for sent in sorted_sentences[:5])

def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])


def clean_rogue_characters(string):
    exclude = ['\\',"\'",'"']
    string = ''.join(string.split('\\n'))
    string = ''.join(ch for ch in string if ch not in exclude)
    return string


import re
def clean_rogue_characters_2(string):
    return re.sub('[^0-9a-zA-Z.]+', ' ', string)


def most_freq_words(freq_dist):
    return freq_dist.most_common(100)

def pruner(review):
    clean_review = ' '.join([word.lower() for word in review.split() if len(word) > 3])
    return clean_review

def pruner_v2(review):
    clean_review = ' '.join([word.lower() for word in review.split() if len(word) > 3 and word not in stop_words])
    
    return clean_review

def get_bow_from_column(df, column):
    all_column_data = ' '.join(df[column].tolist())
    all_column_fd = Counter(all_column_data.split())
    return all_column_fd

def get_common_words(num):
    most_common_neg = [word[0] for word in big_bow_n.most_common(num)]
    most_common_pos = [word[0] for word in big_bow_p.most_common(num)]
    in_both = np.intersect1d(most_common_neg, most_common_pos)
    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)
    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)
    return [len(in_both), len(neg_notpos), len(pos_notneg), len(in_both)/num, in_both, neg_notpos, pos_notneg]

def get_only_polarized(tokens, common_words):
    return [token for token in tokens if token not in common_words[4]] # 70

In [117]:
# pos = get_data_from_files('../hw4_lie_false/')
data = get_data_from_files('AmazonPhotoTextCorpus/')

import pandas as pd
import numpy as np
df = pd.DataFrame(data)
all_df = df

In [118]:
all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)

all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)

all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)

all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)
all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)
all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)

all_df['clean'] = all_df.apply(lambda x: clean_rogue_characters(x[0]), axis=1)
all_df['clean_v2'] = all_df.apply(lambda x: clean_rogue_characters_2(x[0]), axis=1)

In [119]:
clean_df = pd.DataFrame(all_df['clean_v2'].tolist())

In [121]:
all_df = clean_df.copy()
all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)

all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)

all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)

all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)
all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)
all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)

all_df['clean'] = all_df.apply(lambda x: clean_rogue_characters(x[0]), axis=1)
all_df['clean_v2'] = all_df.apply(lambda x: clean_rogue_characters_2(x[0]), axis=1)

all_df['FREQ'] = all_df.apply(lambda x: most_freq_words(x['freq_dist']), axis=1)
all_df['pruned'] = all_df.apply(lambda x: pruner(x[0]), axis=1)


all_df['pruned_nosw'] = all_df.apply(lambda x: pruner_v2(x[0]), axis=1)

In [112]:
all_df

Unnamed: 0,0,tokens,num_tokens,sentences,num_sentences,no_sw,freq_dist,weighted_freq_dist,sentence_scores,summary_sentences,clean,clean_v2,FREQ,pruned,pruned_nosw
0,iS 11 23 1 Q Search If shine a white LED ligh...,"[is, q, search, if, shine, a, white, led, ligh...",135,[ iS 11 23 1 Q Search If shine a white LED lig...,2,"[q, search, shine, white, led, light, prism, w...","{'q': 0.25, 'search': 0.25, 'shine': 0.75, 'wh...","{'q': 0.25, 'search': 0.25, 'shine': 0.75, 'wh...",{'r theemotionmac 140 1 Share r Serendipity u ...,r theemotionmac 140 1 Share r Serendipity u se...,iS 11 23 1 Q Search If shine a white LED ligh...,iS 11 23 1 Q Search If shine a white LED ligh...,"[(r, 1.0), (shine, 0.75), (see, 0.75), (line, ...",search shine white light through prism would s...,search shine white light prism would spectrum ...
1,6 51 . 0 forums.nexusmods.com ee BrettM fosaym...,"[ee, brettm, fosaym, apr, stealth, archery, wo...",340,"[6 51 ., 0 forums.nexusmods.com ee BrettM fosa...",16,"[ee, brettm, fosaym, apr, stealth, archery, wo...","{'ee': 0.25, 'brettm': 0.25, 'fosaym': 0.25, '...","{'ee': 0.25, 'brettm': 0.25, 'fosaym': 0.25, '...",{'0 forums.nexusmods.com ee BrettM fosaym 617 ...,You should be far enough away from the first f...,6 51 . 0 forums.nexusmods.com ee BrettM fosaym...,6 51 . 0 forums.nexusmods.com ee BrettM fosaym...,"[(sneak, 1.0), (one, 1.0), (get, 1.0), (bow, 1...",forums.nexusmods.com brettm fosaym 2012 stealt...,forums.nexusmods.com brettm fosaym 2012 stealt...
2,6 52 al Se forums.nexusmods.com 17 Apr 2012 St...,"[al, se, apr, stealth, archery, works, great, ...",345,[6 52 al Se forums.nexusmods.com 17 Apr 2012 S...,15,"[al, se, apr, stealth, archery, works, great, ...","{'al': 0.25, 'se': 0.25, 'apr': 0.25, 'stealth...","{'al': 0.25, 'se': 0.25, 'apr': 0.25, 'stealth...",{'6 52 al Se forums.nexusmods.com 17 Apr 2012 ...,You should be far enough away from the first f...,6 52 al Se forums.nexusmods.com 17 Apr 2012 St...,6 52 al Se forums.nexusmods.com 17 Apr 2012 St...,"[(sneak, 1.0), (one, 1.0), (get, 1.0), (bow, 1...",forums.nexusmods.com 2012 stealth archery work...,forums.nexusmods.com 2012 stealth archery work...
3,4 06 aw Fe spokesman.com 4 ILULL. LL 1S SUUULI...,"[aw, fe, ilull, ll, suuuliiis, aalllu, culilal...",121,"[4 06 aw Fe spokesman.com 4 ILULL., LL 1S SUUU...",10,"[aw, fe, ilull, suuuliiis, aalllu, culilallis,...","{'aw': 0.3333333333333333, 'fe': 0.33333333333...","{'aw': 0.3333333333333333, 'fe': 0.33333333333...","{'4 06 aw Fe spokesman.com 4 ILULL.': 1.0, 'LL...",Listerine contains herbal oils that fight fung...,4 06 aw Fe spokesman.com 4 ILULL. LL 1S SUUULI...,4 06 aw Fe spokesman.com 4 ILULL. LL 1S SUUULI...,"[(fungus, 1.0), (antifungal, 0.666666666666666...",spokesman.com ilull. suuuliiis aalllu culilall...,spokesman.com ilull. suuuliiis aalllu culilall...
4,Parents do not own their children. No one owns...,"[parents, do, not, own, their, children, no, o...",315,"[Parents do not own their children., No one ow...",14,"[parents, children, one, owns, anything, loan,...","{'parents': 0.7142857142857143, 'children': 0....","{'parents': 0.7142857142857143, 'children': 0....",{'Parents do not own their children.': 0.78571...,Kids are not carbon copies of parents adoptive...,Parents do not own their children. No one owns...,Parents do not own their children. No one owns...,"[(kids, 1.0), (parents, 0.7142857142857143), (...",parents their children. owns anything just loa...,parents children. owns anything loan duration ...
5,o changed shampoos cut out dairy litres of wat...,"[o, changed, shampoos, cut, out, dairy, litres...",236,[o changed shampoos cut out dairy litres of wa...,12,"[changed, shampoos, cut, dairy, litres, water,...","{'changed': 0.2, 'shampoos': 0.2, 'cut': 0.4, ...","{'changed': 0.2, 'shampoos': 0.2, 'cut': 0.4, ...",{'o changed shampoos cut out dairy litres of w...,So searched about antiseptics agents on MRSA a...,o changed shampoos cut out dairy litres of wat...,o changed shampoos cut out dairy litres of wat...,"[(mrsa, 1.0), (folliculitis, 0.8), (like, 0.6)...",changed shampoos dairy litres water fast food ...,changed shampoos dairy litres water fast food ...
6,WEIS 2P 40p o2ua8 yp i deyiaao SpJ0M Jo 3eq su...,"[weis, yp, i, deyiaao, jo, suunseayy, z, e, ae...",183,[WEIS 2P 40p o2ua8 yp i deyiaao SpJ0M Jo 3eq s...,1,"[weis, yp, deyiaao, jo, suunseayy, z, e, aeazz...","{'weis': 0.2, 'yp': 0.2, 'deyiaao': 0.2, 'jo':...","{'weis': 0.2, 'yp': 0.2, 'deyiaao': 0.2, 'jo':...",{},,WEIS 2P 40p o2ua8 yp i deyiaao SpJ0M Jo 3eq su...,WEIS 2P 40p o2ua8 yp i deyiaao SpJ0M Jo 3eq su...,"[(aq, 1.0), (jo, 0.8), (z, 0.6), (du, 0.6), (t...",weis o2ua8 deyiaao spj0m suunseayy aeazze yoty...,weis o2ua8 deyiaao spj0m suunseayy aeazze yoty...
7,casispie hugealienpie thechubbynerd just showe...,"[casispie, hugealienpie, thechubbynerd, just, ...",70,[casispie hugealienpie thechubbynerd just show...,7,"[casispie, hugealienpie, thechubbynerd, shower...","{'casispie': 0.5, 'hugealienpie': 0.5, 'thechu...","{'casispie': 0.5, 'hugealienpie': 0.5, 'thechu...",{'casispie hugealienpie thechubbynerd just sho...,casispie hugealienpie thechubbynerd just showe...,casispie hugealienpie thechubbynerd just showe...,casispie hugealienpie thechubbynerd just showe...,"[(language, 1.0), (kind, 1.0), (casispie, 0.5)...",casispie hugealienpie thechubbynerd just showe...,casispie hugealienpie thechubbynerd shower tho...
8,527k J 173k it Share Oo elfmere 16h Why do you...,"[j, it, share, oo, elfmere, why, do, you, work...",241,[527k J 173k it Share Oo elfmere 16h Why do yo...,11,"[j, share, oo, elfmere, work, factory, honors,...","{'j': 0.5, 'share': 0.25, 'oo': 0.25, 'elfmere...","{'j': 0.5, 'share': 0.25, 'oo': 0.25, 'elfmere...",{'distinctly recall a coworker of mine who wor...,asked him one time vy what he was doing workin...,527k J 173k it Share Oo elfmere 16h Why do you...,527k J 173k it Share Oo elfmere 16h Why do you...,"[(one, 1.0), (working, 1.0), (home, 0.75), (de...",527k 173k share elfmere work factory when have...,527k 173k share elfmere work factory honors de...
9,6 55 at google.com h Google strange women lyin...,"[at, h, google, strange, women, lying, in, pon...",50,[6 55 at google.com h Google strange women lyi...,2,"[h, google, strange, women, lying, ponds, dist...","{'h': 0.25, 'google': 0.25, 'strange': 1.0, 'w...","{'h': 0.25, 'google': 0.25, 'strange': 1.0, 'w...",{'In Pon Etsy Redbubble Turtles STRANGE WOMEN ...,In Pon Etsy Redbubble Turtles STRANGE WOMEN AA...,6 55 at google.com h Google strange women lyin...,6 55 at google.com h Google strange women lyin...,"[(strange, 1.0), (women, 1.0), (lying, 0.75), ...",google.com google strange women lying ponds di...,google.com google strange women lying ponds di...


In [113]:
def get_bow_from_column(df, column):
    all_column_data = ' '.join(df[column].tolist())
    all_column_fd = Counter(all_column_data.split())
    return all_column_fd

big_bow = get_bow_from_column(all_df, 'pruned_nosw')


In [114]:
big_bow.most_common(50)

[('share', 34),
 ('time', 32),
 ('like', 32),
 ('would', 26),
 ('best', 22),
 ('enough', 21),
 ('back', 21),
 ('more', 19),
 ('first', 17),
 ('people', 17),
 ('using', 16),
 ('arrows', 16),
 ('parents', 16),
 ('reply', 16),
 ('good', 15),
 ('make', 15),
 ('work', 15),
 ('think', 15),
 ('problem', 14),
 ('this', 14),
 ('years', 14),
 ('replies', 14),
 ('damage', 13),
 ('kids', 13),
 ('research', 13),
 ('every', 13),
 ('also', 13),
 ('thing', 12),
 ('things', 12),
 ('know', 12),
 ('acomment', 12),
 ('long', 11),
 ('sneak', 11),
 ('even', 11),
 ('many', 11),
 ('vote', 11),
 ('solutions', 11),
 ('find', 10),
 ('able', 10),
 ('take', 10),
 ('away', 10),
 ('said', 10),
 ('need', 10),
 ('made', 10),
 ('since', 9),
 ('less', 9),
 ('much', 9),
 ('couple', 9),
 ('keep', 9),
 ('then', 9)]

In [122]:
def get_bow_from_dict(df, column):
    each_freq = df[column].tolist()
    most_freq = []
    for freq in each_freq:
        for f in freq:
            most_freq.append(f[0])
#         print(freq[0])
        
#     print(most_freq)
    print(Counter(most_freq))
    
#     just_words = [k for (k,v) in  df[column].tolist()]
#     print(just_words)
    
#     all_column_data = ' '.join(df[column].tolist())
#     all_column_fd = Counter(all_column_data.split())
#     return all_column_fd

get_bow_from_dict(all_df, 'FREQ')

# [my_series[c].value_counts() for c in list(my_series.select_dtypes(include=['O']).columns)]




In [124]:
# 1. create a frequen