# Earlier version: load 'HP1.txt', clean it, and pull out a single chapter.
# file = open('HP1.txt').readlines()
# all_text = ""
# for line in file:
#     all_text += line
# all_text = all_text.replace("\n", " ")
# all_text = all_text.replace("\'", "")
# import re
# all_text = re.sub(r'[0-9]', '', all_text)
# chapters = all_text.split('CHAPTER ')
# ch1 = chapters[1]
# Current version: load the week-5 PDF text and treat the whole file as one chapter.
file = open('../WK5/wk5_pdf.txt').readlines()
all_text = ""
for line in file:
    all_text += line
all_text = all_text.replace("\n", " ")  # flatten line breaks into spaces
all_text = all_text.replace("\'", "")   # strip apostrophes
import re
# all_text = re.sub(r'[0-9]', '', all_text)
# chapters = all_text.split('CHAPTER ')
ch1 = all_text  # no chapter headings in this text, so use it all as one chunk
import nltk
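# The tokenizer models and stopword list used below ship as separate NLTK data
# packages; on a fresh environment a one-time download is needed (safe to
# re-run, since NLTK caches the data locally; on newer NLTK releases the
# 'punkt_tab' resource may also be required):
nltk.download('punkt')
nltk.download('stopwords')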
sentence_list = nltk.sent_tokenize(ch1)
sentence_list[:10]  # peek at the first ten sentences
stopwords = nltk.corpus.stopwords.words('english')
# Count how often each non-stopword token appears; lowercase first so the
# counts line up with the lowercased lookups in the scoring step below.
word_frequencies = {}
for word in nltk.word_tokenize(ch1.lower()):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
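# The counting loop above is equivalent to collections.Counter; a minimal
# sketch building the same table (illustrative only: counter_frequencies is a
# new name, and the dict built above is what the rest of the script uses):
from collections import Counter
counter_frequencies = dict(Counter(
    w for w in nltk.word_tokenize(ch1.lower()) if w not in stopwords
))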
# Normalize counts to weights in (0, 1] relative to the most frequent word.
max_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / max_frequency
# Debug: print every token of the first sentence...
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        print(word)
# ...then only the tokens that carry a frequency weight.
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            print(word)
# Score each sentence as the sum of its words' weights, skipping sentences
# of 30+ words so the summary favors short, dense sentences.
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
# Rank sentences by score, highest first.
sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_sentences[:10]
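# The same top-ten pick without sorting the whole list; a sketch using
# heapq.nlargest (illustrative only; top_ten is a new name, not used below):
import heapq
top_ten = heapq.nlargest(10, sentence_scores, key=sentence_scores.get)
top_ten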
# Take the ten highest-scoring sentences as the summary.
summary = [sent[0] for sent in sorted_sentences[:10]]
' '.join(summary)  # join with spaces so the sentences don't run together
list(sentence_scores.items())[:10]  # peek at the raw, unsorted scores
# Alternative cut: keep every sentence whose score exceeds 5
# (a threshold filter; despite the name, nothing here is sorted).
newly_sorted = [sent[0] for sent in sentence_scores.items() if sent[1] > 5]
newly_sorted
' '.join(newly_sorted)
# Refactored: the pipeline above, wrapped into reusable functions.
import nltk
def get_sentence_list(many_sentences):
    return nltk.sent_tokenize(many_sentences)
def get_word_frequencies(many_sentences):
    stopwords = nltk.corpus.stopwords.words('english')
    word_frequencies = {}
    # Lowercase so counts match the lowercased lookups in get_sentence_scores.
    for word in nltk.word_tokenize(many_sentences.lower()):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    return word_frequencies
def get_weighted_frequencies(word_frequencies):
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / max_frequency
    return word_frequencies
def get_sentence_scores(sentence_list, word_frequencies):
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    return sentence_scores
def get_summary(many_sentences):
    sentence_list = get_sentence_list(many_sentences)
    word_frequencies = get_word_frequencies(many_sentences)
    weighted_word_frequencies = get_weighted_frequencies(word_frequencies)
    sentence_scores = get_sentence_scores(sentence_list, weighted_word_frequencies)
    newly_sorted = [sent[0] for sent in sentence_scores.items() if sent[1] > 5]
    print(' '.join(newly_sorted))
def get_summary_by_chapters(chapters):
    for ch, chapter in enumerate(chapters):
        print('****** CHAPTER ' + str(ch) + ' *******')
        get_summary(chapter)
# get_summary_by_chapters(chapters)
get_summary(all_text)
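# For a chaptered text such as the commented-out HP1 example at the top, split
# on the heading first and summarize each piece (a sketch; this file has no
# 'CHAPTER ' headings, so it stays commented out):
# chapters = all_text.split('CHAPTER ')
# get_summary_by_chapters(chapters)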