In [59]:
# file = open('HP1.txt').readlines()

# all_text = ""
# for line in file:
#     all_text += line

# all_text = all_text.replace("\n", " ")
# all_text = all_text.replace("\'", "")

# import re
# all_text = re.sub(r'[0-9]', '', all_text)
# chapters = all_text.split('CHAPTER ')
# ch1 = chapters[1]

# file = open('../WK5/wk5_pdf.txt').readlines()
file = open('../WK5/biglebowski_dialogue_only_v4.txt').readlines()

all_text = ""
for line in file:
    all_text += line

all_text = all_text.replace("@", ". ")
all_text = all_text.replace(".", ". ")
all_text = all_text.replace("\'", "")

import re
# all_text = re.sub(r'[0-9]', '', all_text)
# chapters = all_text.split('CHAPTER ')
ch1 = all_text
In [60]:
import nltk
sentence_list = nltk.sent_tokenize(ch1)
sentence_list[:10]
Out[60]:
['.',
 'A way out west there was a fella, fella I want to tell you about, fella by the name of Jeff Lebowski.',
 'At least, that was the handle his lovin parents gave him, but he never had much use for it himself.',
 'This Lebowski, he called himself the Dude.',
 'Now, Dude, thats a name no one would self-apply where I come from.',
 'But then, there was a lot about the Dude that didnt make a whole lot of sense to me.',
 'And a lot about where he lived, like- wise.',
 'But then again, maybe thats why I found the place sdurned innarestin.',
 '.',
 'They call Los Angeles the City of Angels.']
In [61]:
stopwords = nltk.corpus.stopwords.words('english')
word_frequencies = {}
for word in nltk.word_tokenize(ch1):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1
In [62]:
max_frequency = max(word_frequencies.values())
for word in word_frequencies.keys():
    word_frequencies[word] = (word_frequencies[word]/max_frequency)
In [63]:
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        print(word)
.
In [64]:
for sent in sentence_list[:1]:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            print(word)
.
In [65]:
sentence_scores = {}
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]
In [78]:
sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_sentences[:50]
Out[78]:
[('.', 659.0),
 ('Huh?.', 19.517693315858455),
 ('Yeah.', 16.111839231105286),
 ('Uh-huh.', 13.0),
 ('Is this your homework, Larry?.', 8.044997815640016),
 ('Shut the fuck up, Donny.', 5.9327217125382266),
 ('Huh.', 5.015290519877675),
 ('Yeah, but--.', 4.7824377457404985),
 ('Am I wrong?.', 4.606378331148974),
 ('Hes a good man, and thorough.', 4.564875491480995),
 ('I just want to say, sir, that were both enormous--on a personal level, Branded, especially the early episodes, has been a source of, uh, inspir---.',
  4.531236347750109),
 ('Walter--.', 4.506771515945828),
 ('I just got a, an informal report, uh, that a uh, a member of your team, uh, Walter Sobchak, drew a loaded weapon during league play--.',
  4.4491044124071655),
 ('And lets also not forget--lets not forget, Dude--that keeping wildlife, an amphibious rodent, for uh, domestic, you know, within the city--that isnt legal either.',
  4.2140672782874615),
 ('Okay.', 4.010484927916121),
 ('Yeah, well, right man, there are many facets to this, uh, you know, many interested parties.',
  3.8899082568807346),
 ('And you will, uh, you know, youll, uh, you know what Im trying to say--.',
  3.5495849716033208),
 ('Yeah?.', 3.4560943643512454),
 ('Excuse me?.', 3.4351245085190043),
 ('Yeah well, thats just, ya know, like, your opinion, man.',
  3.417212756662298),
 ('money, yeah, I gotta respecfully, 69 you know, tender my resignation on that matter, cause it looks like your mother really was kidnapped after all.',
  3.413280908693753),
 ('Walter!.', 3.4076015727391873),
 ('Well, okay, youre not privy to all the new shit, so uh, you know, but thats what you pay me for.',
  3.398427260812582),
 ('He suspects that the culprits might be the very people who, uh, soiled your rug, and youre in a unique position to confirm or, uh, disconfirm that suspicion.',
  3.383136740934906),
 ('Wheres the money, Lebowski!.', 3.2407164700742683),
 ('--so, like I say, just thought, you know, fair warning.',
  3.0266491917868064),
 ('Across this line you do not, uh--and also, Dude, Chinaman is not the preferred, uh.',
  3.0218435998252513),
 ('Fuckin A, man.', 3.0187854958497162),
 ('Well.', 3.018348623853211),
 ('Okay sir, youre a Lebowski, Im a Lebowski, thats terrific, Im very busy so what can I do for you?.',
  3.016164263870686),
 ('And yes, well be near the, uh--some burgers, some beers, a few laughs.',
  3.006553079947575),
 ('Jeez.', 3.0),
 ('Thank you.', 3.0),
 ('Jesus.', 3.0),
 ('No.', 3.0),
 ('Well, yeah I did, but I spent most of my time occupying various, um, administration buildings--.',
  2.9908256880733948),
 ('Vee vant zat money, Lebowski.', 2.982961992136304),
 ('First of all, Dude, you dont have an ex, secondly, its a fucking show dog with fucking papers.',
  2.969418960244649),
 ('Oh, shit.', 2.943643512450852),
 ('Hey, cool it Walter.', 2.927916120576671),
 ('Its a complicated case, Maude.', 2.926168632590651),
 ('Fortunately Ive been adhering to a pretty strict, uh, drug regimen to keep my mind, you know, limber.',
  2.90694626474443),
 ('Well, you know, sometimes you eat the bear, and, uh.', 2.90694626474443),
 ('You thought, hey, a deadbeat, a loser, someone the square community wont give a shit about.',
  2.868501529051988),
 ('Some brown, or, uh, rust, coloration.', 2.8671909130624726),
 ('Wal, a wiser fella than mself once said, sometimes you eat the bar and sometimes the bar, wal, he eats you.',
  2.8623853211009176),
 ('Listen, Maude, Im sorry if your stepmother is a nympho, but I dont see what it has to do with--do you have any kalhua?.',
  2.6985583224115333),
 ('THEN WHY CANT YOU--fuck, never mind, just call Donny then, and ask him to--.',
  2.6819571865443423),
 ('And so, Theodore--Donald--Karabotsos, in accordance with what we think   your dying wishes might well have been, we commit your mortal remains to the bosom of.',
  2.6513761467889916),
 ('Im not a--ah, fuck it, just stay away from my fucking lady friend, man.',
  2.636085626911315)]
In [67]:
summary = [sent[0] for sent in sorted_sentences[:10]]
''.join(summary)
Out[67]:
'.Huh?.Yeah.Uh-huh.Is this your homework, Larry?.Shut the fuck up, Donny.Huh.Yeah, but--.Am I wrong?.Hes a good man, and thorough.'
In [68]:
list(sentence_scores.items())[:10]
Out[68]:
[('.', 659.0),
 ('A way out west there was a fella, fella I want to tell you about, fella by the name of Jeff Lebowski.',
  1.9563128003494976),
 ('At least, that was the handle his lovin parents gave him, but he never had much use for it himself.',
  1.9370904325032767),
 ('This Lebowski, he called himself the Dude.', 1.4626474442988204),
 ('Now, Dude, thats a name no one would self-apply where I come from.',
  1.9480122324159022),
 ('But then, there was a lot about the Dude that didnt make a whole lot of sense to me.',
  1.4853647881170817),
 ('And a lot about where he lived, like- wise.', 1.4652686762778506),
 ('But then again, maybe thats why I found the place sdurned innarestin.',
  1.4740061162079512),
 ('They call Los Angeles the City of Angels.', 1.0078636959370904),
 ('I didnt find it to be that exactly, but Ill allow as there are some nice folks there.',
  1.4788117081695065)]
In [76]:
newly_sorted = [sent[0] for sent in sentence_scores.items() if sent[1] > 4]
In [77]:
newly_sorted
''.join(newly_sorted)
Out[77]:
'.Huh?.Am I wrong?.Yeah, but--.Okay.Uh-huh.Walter--.Huh.I just got a, an informal report, uh, that a uh, a member of your team, uh, Walter Sobchak, drew a loaded weapon during league play--.Shut the fuck up, Donny.Yeah.Hes a good man, and thorough.And lets also not forget--lets not forget, Dude--that keeping wildlife, an amphibious rodent, for uh, domestic, you know, within the city--that isnt legal either.I just want to say, sir, that were both enormous--on a personal level, Branded, especially the early episodes, has been a source of, uh, inspir---.Is this your homework, Larry?.'
In [71]:
import nltk
def get_sentence_list(many_sentences):
    return nltk.sent_tokenize(many_sentences)
    
def get_word_frequencies(many_sentences):
    stopwords = nltk.corpus.stopwords.words('english')
    word_frequencies = {}
    for word in nltk.word_tokenize(many_sentences):
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1
    return word_frequencies
                       
def get_weighted_frequencies(word_frequencies):
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = (word_frequencies[word]/max_frequency)
    return word_frequencies

def get_sentence_scores(sentence_list, word_frequencies):
    sentence_scores = {}
    for sent in sentence_list:
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]
    return sentence_scores
    

def get_summary(many_sentences):
    sentence_list = get_sentence_list(many_sentences)
    word_frequencies = get_word_frequencies(many_sentences)
    weighted_word_frequencies = get_weighted_frequencies(word_frequencies)
    sentence_scores = get_sentence_scores(sentence_list, weighted_word_frequencies)

    newly_sorted = [sent[0] for sent in sentence_scores.items() if sent[1] > 5]
    print(''.join(newly_sorted))
    
def get_summary_by_chapters(chapters):
    for ch,chapter in enumerate(chapters):
        print('****** CHAPTER ' + str(ch) + '*******')
        get_summary(chapter)

# get_summary_by_chapters(chapters)

get_summary(all_text)
.Huh?.Uh-huh.Huh.Shut the fuck up, Donny.Yeah.Is this your homework, Larry?.
In [ ]:
 
In [ ]: