In [69]:
import os
# os.listdir returns names in arbitrary, platform-dependent order; sorting
# makes the row order of everything built from these lists reproducible.
negative = sorted(os.listdir('NEG/'))
positive = sorted(os.listdir('POS/'))
In [70]:
def _read_all(directory, filenames):
    """Return the full text of every file in `filenames`, read from `directory`.

    Parameters:
        directory: path prefix, including the trailing slash (e.g. 'POS/').
        filenames: iterable of file names located inside `directory`.

    Returns:
        A list with one string per file, in the order of `filenames`.
    """
    texts = []
    for name in filenames:
        # The with-block closes each handle as soon as the file has been
        # read, rather than leaving one open handle per review.
        with open(directory + name) as handle:
            texts.append(handle.read())
    return texts


positive_alltext = _read_all('POS/', positive)
negative_alltext = _read_all('NEG/', negative)
In [71]:
import pandas as pd
In [72]:
# One row per review; the single text column gets the default label 0.
positive_df = pd.DataFrame(data=positive_alltext)
negative_df = pd.DataFrame(data=negative_alltext)
In [73]:
# Tag every row with its class: 'P' for positive reviews, 'N' for negative.
for frame, label in ((positive_df, 'P'), (negative_df, 'N')):
    frame['PoN'] = label
In [74]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported way to stack frames. As with append, each frame keeps its
# own row labels, so the labels of the positive rows repeat for the negative
# rows that follow them.
all_df = pd.concat([positive_df, negative_df])
In [75]:
# Display the combined frame: positive reviews first, then negative ones.
all_df
Out[75]:
0 PoN
0 films adapted from comic books have had plenty... P
1 you've got mail works alot better than it dese... P
2 " jaws " is a rare film that grabs your atten... P
3 every now and then a movie comes along from a ... P
4 moviemaking is a lot like being the general ma... P
0 that's exactly how long the movie felt to me .... N
1 " quest for camelot " is warner bros . ' firs... N
2 so ask yourself what " 8mm " ( " eight millime... N
3 synopsis : a mentally unstable man undergoing ... N
4 capsule : in 2176 on the planet mars police ta... N
In [76]:
from nltk.tokenize import word_tokenize
def tokenizer(sentence):
    """Split `sentence` into a list of tokens using NLTK's `word_tokenize`."""
    tokens = word_tokenize(sentence)
    return tokens

# Column 0 holds the raw review text. Mapping over that column directly
# avoids DataFrame.apply(axis=1), which builds a Series for every row just to
# read a single field from it.
all_df['tokenized'] = all_df[0].map(tokenizer)
# Number of tokens in each review.
all_df['tokenized_count'] = all_df['tokenized'].map(len)
In [77]:
from nltk.corpus import stopwords
# NLTK's English stop-word list, held as a set for fast membership tests.
stop_words = set(stopwords.words("english"))


def stopword_remover(sentence):
    """Return the tokens of `sentence` that are not in `stop_words`.

    `sentence` is an iterable of tokens; the result is a new list that keeps
    the surviving tokens in their original order. Matching is exact, so a
    token is dropped only if it appears verbatim in `stop_words`.
    """
    return [token for token in sentence if token not in stop_words]
# Filter each token list column-wise; mapping over the 'tokenized' column
# avoids DataFrame.apply(axis=1), which builds a Series for every row just to
# read a single field from it.
all_df['no_stopwords'] = all_df['tokenized'].map(stopword_remover)
# Number of tokens left in each review after stop-word removal.
all_df['no_stopwords_count'] = all_df['no_stopwords'].map(len)
In [78]:
# Display the frame with the tokenized / no_stopwords columns and their counts.
all_df
Out[78]:
0 PoN tokenized tokenized_count no_stopwords no_stopwords_count
0 films adapted from comic books have had plenty... P [films, adapted, from, comic, books, have, had... 826 [films, adapted, comic, books, plenty, success... 540
1 you've got mail works alot better than it dese... P [you, 've, got, mail, works, alot, better, tha... 476 ['ve, got, mail, works, alot, better, deserves... 267
2 " jaws " is a rare film that grabs your atten... P [``, jaws, ``, is, a, rare, film, that, grabs,... 1197 [``, jaws, ``, rare, film, grabs, attention, s... 756
3 every now and then a movie comes along from a ... P [every, now, and, then, a, movie, comes, along... 786 [every, movie, comes, along, suspect, studio, ... 484
4 moviemaking is a lot like being the general ma... P [moviemaking, is, a, lot, like, being, the, ge... 764 [moviemaking, lot, like, general, manager, nfl... 479
0 that's exactly how long the movie felt to me .... N [that, 's, exactly, how, long, the, movie, fel... 689 ['s, exactly, long, movie, felt, ., n't, even,... 447
1 " quest for camelot " is warner bros . ' firs... N [``, quest, for, camelot, ``, is, warner, bros... 574 [``, quest, camelot, ``, warner, bros, ., ', f... 377
2 so ask yourself what " 8mm " ( " eight millime... N [so, ask, yourself, what, ``, 8mm, ``, (, ``, ... 656 [ask, ``, 8mm, ``, (, ``, eight, millimeter, `... 412
3 synopsis : a mentally unstable man undergoing ... N [synopsis, :, a, mentally, unstable, man, unde... 855 [synopsis, :, mentally, unstable, man, undergo... 520
4 capsule : in 2176 on the planet mars police ta... N [capsule, :, in, 2176, on, the, planet, mars, ... 748 [capsule, :, 2176, planet, mars, police, takin... 454
In [ ]:
 
In [ ]: