# Daily log 9-30-20
# (less than 1 minute read)
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
# Download the debate transcript page and turn each <p> inside the
# "fl-callout-text" container into a {'name', 'time', 'words'} record,
# collected in `transcript`.
url = "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020"
page = requests.get(url)
# Surface HTTP failures here instead of parsing an error page below.
page.raise_for_status()
soup = bs(page.content, "html.parser")
content_block = soup.find("div", {"class": "fl-callout-text"})
if content_block is None:
    # Without this check the next line fails with an opaque AttributeError.
    raise ValueError("could not find div.fl-callout-text in the downloaded page")
targets = content_block.find_all('p')
transcript = []
prev_time = ''
for t in targets:
    target = t.get_text()
    # Everything before the first ':' is taken as the speaker name.
    name = target.split(':')[0]
    try:
        # The timestamp is the text between the first '(' and the next ')'.
        stamp = target.split('(')[1].split(')')[0]
    except IndexError:
        # No parenthesised timestamp: reuse the most recent one.
        stamp = prev_time
    prev_time = stamp
    try:
        # The spoken text is the second line of the paragraph.
        words = target.split('\n')[1].strip()
    except IndexError:
        # Paragraph has no second line. Report it and skip it, rather than
        # appending a record that reuses the previous paragraph's words
        # (or hitting a NameError on the very first paragraph).
        print(target)
        continue
    transcript.append({'name': name, 'time': stamp, 'words': words})
from datetime import time
def better_time(t):
    """Prefix ``"00:"`` onto a timestamp with fewer than three ':'-separated fields.

    A string that already contains at least two ':' separators is returned
    unchanged; any other string is returned with ``"00:"`` prepended.
    """
    has_three_fields = t.count(':') >= 2
    return t if has_three_fields else "00:" + t
# Build the DataFrame from the scraped records; `df` was never created before
# its first use. Explicit columns keep the 'time' lookup valid even when
# `transcript` is empty.
df = pd.DataFrame(transcript, columns=['name', 'time', 'words'])
# Normalise every timestamp to three ':'-separated fields.
df['better_time'] = df['time'].apply(better_time)
def convert_time(t):
    """Parse an ISO-format time string into a ``datetime.time``.

    Args:
        t: The value to parse, passed to ``time.fromisoformat``.

    Returns:
        The parsed ``datetime.time``, or the string ``'error'`` when ``t``
        is not a valid ISO time string or is not a string at all.
    """
    try:
        return time.fromisoformat(t)
    except (ValueError, TypeError):
        # Only parse failures map to the sentinel; anything else (for example
        # KeyboardInterrupt) propagates instead of being silently swallowed.
        return 'error'
# Parse each normalised timestamp; unparseable values become the string 'error'.
df['timestamp'] = df['better_time'].apply(convert_time)
# Express each timestamp as a Timedelta. errors='coerce' maps the 'error'
# sentinel to NaT instead of letting one bad row abort the whole conversion.
df['datetime'] = pd.to_timedelta(df['timestamp'].astype(str), errors='coerce')
# Difference between each row's timestamp and the previous row's, as a
# Timedelta and as a number of seconds.
df['time_diff'] = df['datetime'].diff()
df['seconds'] = df['time_diff'].dt.total_seconds()