This tutorial is based on chapters 11-13 of the book "Python for Everybody" https://www.py4e.com
Step 1: import all necessary packages
import pprint
import re
import urllib
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup
Step 2: download a sample webpage. You can save the HTML page to your computer and use a text editor to view its content.
# Address of the sample lyrics page that the rest of the tutorial works on.
url = "http://www.metrolyrics.com/you-belong-with-me-lyrics-taylor-swift.html"
# Open the page inside a with-block so the HTTP response is closed as soon as
# its body has been read; html ends up holding the raw bytes of the page.
with urllib.request.urlopen(url) as response:
    html = response.read()
Step 3: use BeautifulSoup to parse the webpage and extract the lyrics content. The division that includes the lyrics is the element whose id attribute is "lyrics-body-text".
# Parse the downloaded page with the 'html.parser' backend and show its title.
soup = BeautifulSoup(html, 'html.parser')
print(soup.title.string)
# Collect every element inside <body> whose id is "lyrics-body-text" and keep
# the text of the first match.
lyrics_matches = soup.body.find_all(id='lyrics-body-text')
text = lyrics_matches[0].text
print(text)
Taylor Swift - You Belong With Me Lyrics | MetroLyrics You're on the phone with your girlfriend—she's upset She's going off about something that you said 'Cause she doesn't get your humor like I do.I'm in the room, it's a typical Tuesday night. I'm listening to the kind of music she doesn't like. And she'll never know your story like I doBut she wears short skirts I wear t-shirt She's cheer captain And I'm on the bleachersDreaming about the day when you wake up and find That what you're looking for has been here the whole time. Related 11 Delicious Misheard Lyrics About Food NEW SONG: Taylor Swift - 'Lover' - LYRICS Prime Day concert by Amazon Music will be headlined by Taylor Swift If you could see That I'm the one Who understands you Been here all along So, why can't you see You belong with me You belong with me?Walk in the streets with you in your worn out jeans I can't help thinking this is how it ought to be. Laughing on a park bench thinking to myself "Hey, isn't this easy?"And you've got a smile That can light up this whole town I haven't seen it in awhile Since she brought you down.You say you're fine—I know you better than that Hey, what you doing with a girl like that? She wears high heels I wear sneakers She's cheer captain And I'm on the bleachersDreaming about the day when you wake up and find That what you're looking for has been here the whole timeIf you could see That I'm the one Who understands you Been here all along So, why can't you see You belong with me?Standing by and waiting at your backdoor. All this time how could you not know, baby? You belong with me You belong with meOh, I remember you driving to my house In the middle of the night I'm the one who makes you laugh When you know you're 'bout to cryI know your favorite songs And you tell me about your dreams Think I know where you belong Think I know it's with meCan't you se that I'm the one Who understands you? 
Been here all along So, why can't you see You belong with me?Standing by and waiting at your backdoor. All this time how could you not know, baby? You belong with me You belong with me You belong with me Have you ever thought just maybe You belong with me? You belong with me
Step 4: split text into individual words
# split() with no arguments breaks the text on runs of whitespace, so
# punctuation stays attached to the neighbouring word (e.g. "room,").
words = text.split()
print(words)
["You're", 'on', 'the', 'phone', 'with', 'your', "girlfriend—she's", 'upset', "She's", 'going', 'off', 'about', 'something', 'that', 'you', 'said', "'Cause", 'she', "doesn't", 'get', 'your', 'humor', 'like', 'I', "do.I'm", 'in', 'the', 'room,', "it's", 'a', 'typical', 'Tuesday', 'night.', "I'm", 'listening', 'to', 'the', 'kind', 'of', 'music', 'she', "doesn't", 'like.', 'And', "she'll", 'never', 'know', 'your', 'story', 'like', 'I', 'doBut', 'she', 'wears', 'short', 'skirts', 'I', 'wear', 't-shirt', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'the', 'bleachersDreaming', 'about', 'the', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'the', 'whole', 'time.', 'Related', '11', 'Delicious', 'Misheard', 'Lyrics', 'About', 'Food', 'NEW', 'SONG:', 'Taylor', 'Swift', '-', "'Lover'", '-', 'LYRICS', 'Prime', 'Day', 'concert', 'by', 'Amazon', 'Music', 'will', 'be', 'headlined', 'by', 'Taylor', 'Swift', 'If', 'you', 'could', 'see', 'That', "I'm", 'the', 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me?Walk', 'in', 'the', 'streets', 'with', 'you', 'in', 'your', 'worn', 'out', 'jeans', 'I', "can't", 'help', 'thinking', 'this', 'is', 'how', 'it', 'ought', 'to', 'be.', 'Laughing', 'on', 'a', 'park', 'bench', 'thinking', 'to', 'myself', '"Hey,', "isn't", 'this', 'easy?"And', "you've", 'got', 'a', 'smile', 'That', 'can', 'light', 'up', 'this', 'whole', 'town', 'I', "haven't", 'seen', 'it', 'in', 'awhile', 'Since', 'she', 'brought', 'you', 'down.You', 'say', "you're", 'fine—I', 'know', 'you', 'better', 'than', 'that', 'Hey,', 'what', 'you', 'doing', 'with', 'a', 'girl', 'like', 'that?', 'She', 'wears', 'high', 'heels', 'I', 'wear', 'sneakers', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'the', 'bleachersDreaming', 'about', 'the', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 
'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'the', 'whole', 'timeIf', 'you', 'could', 'see', 'That', "I'm", 'the', 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'meOh,', 'I', 'remember', 'you', 'driving', 'to', 'my', 'house', 'In', 'the', 'middle', 'of', 'the', 'night', "I'm", 'the', 'one', 'who', 'makes', 'you', 'laugh', 'When', 'you', 'know', "you're", "'bout", 'to', 'cryI', 'know', 'your', 'favorite', 'songs', 'And', 'you', 'tell', 'me', 'about', 'your', 'dreams', 'Think', 'I', 'know', 'where', 'you', 'belong', 'Think', 'I', 'know', "it's", 'with', "meCan't", 'you', 'se', 'that', "I'm", 'the', 'one', 'Who', 'understands', 'you?', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'Have', 'you', 'ever', 'thought', 'just', 'maybe', 'You', 'belong', 'with', 'me?', 'You', 'belong', 'with', 'me']
Remove stopwords
stopwords = ['is', 'are', 'the', 'a', 'an']


def removeStopwords(wordlist, stopwords):
    """Return the items of wordlist that do not appear in stopwords.

    Matching is exact and case-sensitive, and the order of the surviving
    words is preserved. Neither argument is modified.

    Args:
        wordlist: Iterable of words to filter.
        stopwords: Iterable of words to drop.

    Returns:
        A new list containing the words that were kept.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole stopword list for every word.
    excluded = frozenset(stopwords)
    return [w for w in wordlist if w not in excluded]
# Replace words with the filtered list, then display what is left.
words = removeStopwords(words, stopwords)
print(words)
["You're", 'on', 'phone', 'with', 'your', "girlfriend—she's", 'upset', "She's", 'going', 'off', 'about', 'something', 'that', 'you', 'said', "'Cause", 'she', "doesn't", 'get', 'your', 'humor', 'like', 'I', "do.I'm", 'in', 'room,', "it's", 'typical', 'Tuesday', 'night.', "I'm", 'listening', 'to', 'kind', 'of', 'music', 'she', "doesn't", 'like.', 'And', "she'll", 'never', 'know', 'your', 'story', 'like', 'I', 'doBut', 'she', 'wears', 'short', 'skirts', 'I', 'wear', 't-shirt', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'bleachersDreaming', 'about', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'whole', 'time.', 'Related', '11', 'Delicious', 'Misheard', 'Lyrics', 'About', 'Food', 'NEW', 'SONG:', 'Taylor', 'Swift', '-', "'Lover'", '-', 'LYRICS', 'Prime', 'Day', 'concert', 'by', 'Amazon', 'Music', 'will', 'be', 'headlined', 'by', 'Taylor', 'Swift', 'If', 'you', 'could', 'see', 'That', "I'm", 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me?Walk', 'in', 'streets', 'with', 'you', 'in', 'your', 'worn', 'out', 'jeans', 'I', "can't", 'help', 'thinking', 'this', 'how', 'it', 'ought', 'to', 'be.', 'Laughing', 'on', 'park', 'bench', 'thinking', 'to', 'myself', '"Hey,', "isn't", 'this', 'easy?"And', "you've", 'got', 'smile', 'That', 'can', 'light', 'up', 'this', 'whole', 'town', 'I', "haven't", 'seen', 'it', 'in', 'awhile', 'Since', 'she', 'brought', 'you', 'down.You', 'say', "you're", 'fine—I', 'know', 'you', 'better', 'than', 'that', 'Hey,', 'what', 'you', 'doing', 'with', 'girl', 'like', 'that?', 'She', 'wears', 'high', 'heels', 'I', 'wear', 'sneakers', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'bleachersDreaming', 'about', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'whole', 'timeIf', 'you', 'could', 
'see', 'That', "I'm", 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'meOh,', 'I', 'remember', 'you', 'driving', 'to', 'my', 'house', 'In', 'middle', 'of', 'night', "I'm", 'one', 'who', 'makes', 'you', 'laugh', 'When', 'you', 'know', "you're", "'bout", 'to', 'cryI', 'know', 'your', 'favorite', 'songs', 'And', 'you', 'tell', 'me', 'about', 'your', 'dreams', 'Think', 'I', 'know', 'where', 'you', 'belong', 'Think', 'I', 'know', "it's", 'with', "meCan't", 'you', 'se', 'that', "I'm", 'one', 'Who', 'understands', 'you?', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'Have', 'you', 'ever', 'thought', 'just', 'maybe', 'You', 'belong', 'with', 'me?', 'You', 'belong', 'with', 'me']
count word frequency
# Tally how many times each word occurs; get() supplies 0 the first time a
# word is seen.
counts = dict()
for word in words:
    counts[word] = counts.get(word, 0) + 1
# sorted() builds a new list and leaves counts unchanged, so its result is
# kept here: the distinct words ordered from most to least frequent.
words_by_frequency = sorted(counts, key=counts.__getitem__, reverse=True)
# pprint lists dictionary keys in sorted order, so this display is
# alphabetical by word rather than ordered by count.
pprint.pprint(counts)
{'"Hey,': 1, "'Cause": 1, "'Lover'": 1, "'bout": 1, '-': 2, '11': 1, 'About': 1, 'All': 2, 'Amazon': 1, 'And': 4, 'Been': 3, 'Day': 1, 'Delicious': 1, 'Food': 1, 'Have': 1, 'Hey,': 1, 'I': 9, "I'm": 7, 'If': 1, 'In': 1, 'LYRICS': 1, 'Laughing': 1, 'Lyrics': 1, 'Misheard': 1, 'Music': 1, 'NEW': 1, 'Prime': 1, 'Related': 1, 'SONG:': 1, 'She': 1, "She's": 3, 'Since': 1, 'So,': 3, 'Swift': 2, 'Taylor': 2, 'That': 5, 'Think': 2, 'Tuesday': 1, 'When': 1, 'Who': 3, 'You': 11, "You're": 1, 'about': 4, 'all': 3, 'along': 3, 'and': 4, 'at': 2, 'awhile': 1, 'baby?': 2, 'backdoor.': 2, 'be': 1, 'be.': 1, 'been': 2, 'belong': 12, 'bench': 1, 'better': 1, 'bleachersDreaming': 2, 'brought': 1, 'by': 4, 'can': 1, "can't": 4, 'captain': 2, 'cheer': 2, 'concert': 1, 'could': 4, 'cryI': 1, 'day': 2, "do.I'm": 1, 'doBut': 1, "doesn't": 2, 'doing': 1, 'down.You': 1, 'dreams': 1, 'driving': 1, 'easy?"And': 1, 'ever': 1, 'favorite': 1, 'find': 2, 'fine—I': 1, 'for': 2, 'get': 1, 'girl': 1, "girlfriend—she's": 1, 'going': 1, 'got': 1, 'has': 2, "haven't": 1, 'headlined': 1, 'heels': 1, 'help': 1, 'here': 5, 'high': 1, 'house': 1, 'how': 3, 'humor': 1, 'in': 4, "isn't": 1, 'it': 2, "it's": 2, 'jeans': 1, 'just': 1, 'kind': 1, 'know': 6, 'know,': 2, 'laugh': 1, 'light': 1, 'like': 3, 'like.': 1, 'listening': 1, 'looking': 2, 'makes': 1, 'maybe': 1, 'me': 7, 'me?': 1, 'me?Standing': 2, 'me?Walk': 1, "meCan't": 1, 'meOh,': 1, 'middle': 1, 'music': 1, 'my': 1, 'myself': 1, 'never': 1, 'night': 1, 'night.': 1, 'not': 2, 'of': 2, 'off': 1, 'on': 4, 'one': 4, 'ought': 1, 'out': 1, 'park': 1, 'phone': 1, 'remember': 1, 'room,': 1, 'said': 1, 'say': 1, 'se': 1, 'see': 5, 'seen': 1, 'she': 4, "she'll": 1, 'short': 1, 'skirts': 1, 'smile': 1, 'sneakers': 1, 'something': 1, 'songs': 1, 'story': 1, 'streets': 1, 't-shirt': 1, 'tell': 1, 'than': 1, 'that': 3, 'that?': 1, 'thinking': 2, 'this': 5, 'thought': 1, 'time': 2, 'time.': 1, 'timeIf': 1, 'to': 5, 'town': 1, 'typical': 1, 'understands': 3, 'up': 
3, 'upset': 1, 'waiting': 2, 'wake': 2, 'wear': 2, 'wears': 2, 'what': 3, 'when': 2, 'where': 1, 'who': 1, 'whole': 3, 'why': 3, 'will': 1, 'with': 15, 'worn': 1, 'you': 23, "you're": 4, "you've": 1, 'you?': 1, 'your': 8}
sort words by frequency
The above method uses a loop, which needs quite a lot of programming and is also slow. The following method uses the DataFrame data structure in the pandas package to quickly count and sort words by frequency. The pandas documentation includes more details on this powerful data structure: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
# One-column frame holding every word; value_counts() tallies each distinct
# word and returns the tallies ordered from most to least frequent.
df = pd.DataFrame({'word': words})
x = df['word'].value_counts()
pprint.pprint(x)
you 23 with 15 belong 12 You 11 I 9 .. listening 1 short 1 do.I'm 1 time. 1 just 1 Name: word, Length: 186, dtype: int64
import nltk

# Break the lyrics into tokens, then label every token with a part-of-speech
# tag.
tokens = nltk.word_tokenize(text)
tags = nltk.pos_tag(tokens)
# Each entry of tags is a (token, tag) pair; show the first one.
first_token, first_tag = tags[0]
print(first_token, first_tag)
You PRP
def removeStopwords(wordlist, stopwords):
    """Return the items of wordlist that do not appear in stopwords.

    Matching is exact and case-sensitive, and the order of the surviving
    words is preserved. Neither argument is modified.

    Args:
        wordlist: Iterable of words to filter.
        stopwords: Iterable of words to drop.

    Returns:
        A new list containing the words that were kept.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole stopword list for every word.
    excluded = frozenset(stopwords)
    return [w for w in wordlist if w not in excluded]
# Filter words against the stopwords list and display the result.
words = removeStopwords(words, stopwords)
print(words)
["You're", 'on', 'phone', 'with', 'your', "girlfriend—she's", 'upset', "She's", 'going', 'off', 'about', 'something', 'that', 'you', 'said', "'Cause", 'she', "doesn't", 'get', 'your', 'humor', 'like', 'I', "do.I'm", 'in', 'room,', "it's", 'typical', 'Tuesday', 'night.', "I'm", 'listening', 'to', 'kind', 'of', 'music', 'she', "doesn't", 'like.', 'And', "she'll", 'never', 'know', 'your', 'story', 'like', 'I', 'doBut', 'she', 'wears', 'short', 'skirts', 'I', 'wear', 't-shirt', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'bleachersDreaming', 'about', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'whole', 'time.', 'Related', '11', 'Delicious', 'Misheard', 'Lyrics', 'About', 'Food', 'NEW', 'SONG:', 'Taylor', 'Swift', '-', "'Lover'", '-', 'LYRICS', 'Prime', 'Day', 'concert', 'by', 'Amazon', 'Music', 'will', 'be', 'headlined', 'by', 'Taylor', 'Swift', 'If', 'you', 'could', 'see', 'That', "I'm", 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me?Walk', 'in', 'streets', 'with', 'you', 'in', 'your', 'worn', 'out', 'jeans', 'I', "can't", 'help', 'thinking', 'this', 'how', 'it', 'ought', 'to', 'be.', 'Laughing', 'on', 'park', 'bench', 'thinking', 'to', 'myself', '"Hey,', "isn't", 'this', 'easy?"And', "you've", 'got', 'smile', 'That', 'can', 'light', 'up', 'this', 'whole', 'town', 'I', "haven't", 'seen', 'it', 'in', 'awhile', 'Since', 'she', 'brought', 'you', 'down.You', 'say', "you're", 'fine—I', 'know', 'you', 'better', 'than', 'that', 'Hey,', 'what', 'you', 'doing', 'with', 'girl', 'like', 'that?', 'She', 'wears', 'high', 'heels', 'I', 'wear', 'sneakers', "She's", 'cheer', 'captain', 'And', "I'm", 'on', 'bleachersDreaming', 'about', 'day', 'when', 'you', 'wake', 'up', 'and', 'find', 'That', 'what', "you're", 'looking', 'for', 'has', 'been', 'here', 'whole', 'timeIf', 'you', 'could', 
'see', 'That', "I'm", 'one', 'Who', 'understands', 'you', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'meOh,', 'I', 'remember', 'you', 'driving', 'to', 'my', 'house', 'In', 'middle', 'of', 'night', "I'm", 'one', 'who', 'makes', 'you', 'laugh', 'When', 'you', 'know', "you're", "'bout", 'to', 'cryI', 'know', 'your', 'favorite', 'songs', 'And', 'you', 'tell', 'me', 'about', 'your', 'dreams', 'Think', 'I', 'know', 'where', 'you', 'belong', 'Think', 'I', 'know', "it's", 'with', "meCan't", 'you', 'se', 'that', "I'm", 'one', 'Who', 'understands', 'you?', 'Been', 'here', 'all', 'along', 'So,', 'why', "can't", 'you', 'see', 'You', 'belong', 'with', 'me?Standing', 'by', 'and', 'waiting', 'at', 'your', 'backdoor.', 'All', 'this', 'time', 'how', 'could', 'you', 'not', 'know,', 'baby?', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'You', 'belong', 'with', 'me', 'Have', 'you', 'ever', 'thought', 'just', 'maybe', 'You', 'belong', 'with', 'me?', 'You', 'belong', 'with', 'me']
# Tally how many times each word occurs; get() supplies 0 the first time a
# word is seen.
counts = dict()
for word in words:
    counts[word] = counts.get(word, 0) + 1
# sorted() builds a new list and leaves counts unchanged, so its result is
# kept here: the distinct words ordered from most to least frequent.
words_by_frequency = sorted(counts, key=counts.__getitem__, reverse=True)
# pprint lists dictionary keys in sorted order, so this display is
# alphabetical by word rather than ordered by count.
pprint.pprint(counts)
{'"Hey,': 1, "'Cause": 1, "'Lover'": 1, "'bout": 1, '-': 2, '11': 1, 'About': 1, 'All': 2, 'Amazon': 1, 'And': 4, 'Been': 3, 'Day': 1, 'Delicious': 1, 'Food': 1, 'Have': 1, 'Hey,': 1, 'I': 9, "I'm": 7, 'If': 1, 'In': 1, 'LYRICS': 1, 'Laughing': 1, 'Lyrics': 1, 'Misheard': 1, 'Music': 1, 'NEW': 1, 'Prime': 1, 'Related': 1, 'SONG:': 1, 'She': 1, "She's": 3, 'Since': 1, 'So,': 3, 'Swift': 2, 'Taylor': 2, 'That': 5, 'Think': 2, 'Tuesday': 1, 'When': 1, 'Who': 3, 'You': 11, "You're": 1, 'about': 4, 'all': 3, 'along': 3, 'and': 4, 'at': 2, 'awhile': 1, 'baby?': 2, 'backdoor.': 2, 'be': 1, 'be.': 1, 'been': 2, 'belong': 12, 'bench': 1, 'better': 1, 'bleachersDreaming': 2, 'brought': 1, 'by': 4, 'can': 1, "can't": 4, 'captain': 2, 'cheer': 2, 'concert': 1, 'could': 4, 'cryI': 1, 'day': 2, "do.I'm": 1, 'doBut': 1, "doesn't": 2, 'doing': 1, 'down.You': 1, 'dreams': 1, 'driving': 1, 'easy?"And': 1, 'ever': 1, 'favorite': 1, 'find': 2, 'fine—I': 1, 'for': 2, 'get': 1, 'girl': 1, "girlfriend—she's": 1, 'going': 1, 'got': 1, 'has': 2, "haven't": 1, 'headlined': 1, 'heels': 1, 'help': 1, 'here': 5, 'high': 1, 'house': 1, 'how': 3, 'humor': 1, 'in': 4, "isn't": 1, 'it': 2, "it's": 2, 'jeans': 1, 'just': 1, 'kind': 1, 'know': 6, 'know,': 2, 'laugh': 1, 'light': 1, 'like': 3, 'like.': 1, 'listening': 1, 'looking': 2, 'makes': 1, 'maybe': 1, 'me': 7, 'me?': 1, 'me?Standing': 2, 'me?Walk': 1, "meCan't": 1, 'meOh,': 1, 'middle': 1, 'music': 1, 'my': 1, 'myself': 1, 'never': 1, 'night': 1, 'night.': 1, 'not': 2, 'of': 2, 'off': 1, 'on': 4, 'one': 4, 'ought': 1, 'out': 1, 'park': 1, 'phone': 1, 'remember': 1, 'room,': 1, 'said': 1, 'say': 1, 'se': 1, 'see': 5, 'seen': 1, 'she': 4, "she'll": 1, 'short': 1, 'skirts': 1, 'smile': 1, 'sneakers': 1, 'something': 1, 'songs': 1, 'story': 1, 'streets': 1, 't-shirt': 1, 'tell': 1, 'than': 1, 'that': 3, 'that?': 1, 'thinking': 2, 'this': 5, 'thought': 1, 'time': 2, 'time.': 1, 'timeIf': 1, 'to': 5, 'town': 1, 'typical': 1, 'understands': 3, 'up': 
3, 'upset': 1, 'waiting': 2, 'wake': 2, 'wear': 2, 'wears': 2, 'what': 3, 'when': 2, 'where': 1, 'who': 1, 'whole': 3, 'why': 3, 'will': 1, 'with': 15, 'worn': 1, 'you': 23, "you're": 4, "you've": 1, 'you?': 1, 'your': 8}
# One-column frame holding every word; value_counts() tallies each distinct
# word and returns the tallies ordered from most to least frequent.
df = pd.DataFrame({'word': words})
x = df['word'].value_counts()
pprint.pprint(x)
you 23 with 15 belong 12 You 11 I 9 .. listening 1 short 1 do.I'm 1 time. 1 just 1 Name: word, Length: 186, dtype: int64