# lines = open('../summary_test_positive.txt').readlines()
lines = open('HP1.txt').readlines()
lines[:10]
all_text = ""
for line in lines:
    all_text += line
type(all_text)
all_text = all_text.replace("\n", " ")  # collapse line breaks into spaces
all_text = all_text.replace("'", "")    # strip apostrophes before tokenizing
import re
# article_text = re.sub(r'\[[0-9]*\]', '', article_text)
all_text = re.sub(r'[0-9]', '', all_text)  # drop all digits
chapters = all_text.split('CHAPTER ')      # split the book on its chapter headings
chapters[1][:100]
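## ILLUSTRATIVE EXAMPLE
## Sanity-check the split: chapters[0] is whatever text precedes the first 'CHAPTER ' heading
print(len(chapters))
for chapter in chapters[1:4]:
    print(chapter[:60])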
## re.sub to remove square brackets and extra spaces from the ORIGINAL article_text
import re
article_text = all_text  # work with the cleaned full text from here on
# article_text = re.sub(r'\[[0-9]*\]', '', article_text)
formatted_article_text = re.sub(r'\n+', ' ', article_text)
formatted_article_text[:1000]
## re.sub to remove extra characters and digits for a new FORMATTED_TEXT variable
# formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text)
# formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)
# formatted_article_text[:1000]
import nltk
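# sent_tokenize/word_tokenize and the stopword list below need two NLTK resources;
# run these once if they are not installed yet:
# nltk.download('punkt')
# nltk.download('stopwords')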
sentence_list = nltk.sent_tokenize(article_text)
sentence_list[:5]
stopwords = nltk.corpus.stopwords.words('english')
word_frequencies = {}
# lowercase here so the counts line up with the lowercased sentences scored below;
# skip punctuation tokens, since the non-letter cleanup above is left commented out
for word in nltk.word_tokenize(formatted_article_text.lower()):
    if word.isalpha() and word not in stopwords:
        if word not in word_frequencies:
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
max_frequency = max(word_frequencies.values())
# scale every count by the highest count so weights fall in (0, 1]
for word in word_frequencies:
    word_frequencies[word] = word_frequencies[word] / max_frequency
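## ILLUSTRATIVE EXAMPLE
## A compact equivalent of the loop above, sketched with collections.Counter
## (left commented out; the loop above is what actually runs)
# from collections import Counter
# counts = Counter(w for w in nltk.word_tokenize(formatted_article_text.lower())
#                  if w.isalpha() and w not in stopwords)
# max_count = counts.most_common(1)[0][1]
# word_frequencies = {w: c / max_count for w, c in counts.items()}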
## ILLUSTRATIVE EXAMPLE
## Nothing removed
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        print(word)
## ILLUSTRATIVE EXAMPLE
## Stopwords etc. removed
## Only the words in a sentence that appear in our frequency distribution contribute weight
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies:
            print(word)
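## ILLUSTRATIVE EXAMPLE
## The same filter, but also printing the normalised weight each surviving word contributes
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies:
            print(word, round(word_frequencies[word], 3))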
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies:
            if len(sent.split(' ')) < 30:   # only score reasonably short sentences
                if sent not in sentence_scores:
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_sentences[:5]
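## ILLUSTRATIVE EXAMPLE
## heapq.nlargest picks the same top sentences without sorting the whole dict
## (a sketch equivalent to the sorted() call above, left commented out)
# import heapq
# top_sentences = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)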
summary = [sent[0] for sent in sorted_sentences[:10]]
' '.join(summary)
lolsummary = ' '.join(summary).strip()
# lolsummary = re.sub(r'\\', ' ', lolsummary)
# lolsummary
lolsummary = lolsummary.replace("'", "")
lolsummary
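## ILLUSTRATIVE EXAMPLE
## The summary above comes out in score order; re-emitting the same sentences in the
## order they occur in the book reads more naturally (optional sketch, commented out)
# ordered = sorted(summary, key=sentence_list.index)
# print(' '.join(ordered))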
# type(lolsummary)