daily log 9-30-20

less than 1 minute read


import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

from datetime import time

url = "https://www.rev.com/blog/transcripts/donald-trump-joe-biden-1st-presidential-debate-transcript-2020"

# Fetch the transcript page. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the transcript.
page = requests.get(url, timeout=30)
page.raise_for_status()
soup = bs(page.content, "html.parser")
content_block = soup.find("div", {"class": "fl-callout-text"})
if content_block is None:
    # soup.find() returns None when nothing matches; fail with a clear
    # message instead of an AttributeError on the next line.
    raise RuntimeError("transcript container 'div.fl-callout-text' not found")
targets = content_block.find_all('p')


# Build one record per <p> element: the text before the first ':' is taken as
# the speaker name, the text inside the first '(...)' as the timestamp, and
# the second line of the paragraph as the spoken words.
transcript = []
prev_time = ''
for t in targets:
    target = t.get_text()
    name = target.split(':')[0]

    # Named `timestamp` (not `time`) so the loop does not shadow the
    # datetime.time class imported above.
    try:
        timestamp = target.split('(')[1].split(')')[0]
    except IndexError:
        # No '(' in this paragraph: reuse the previous paragraph's timestamp.
        timestamp = prev_time

    try:
        words = target.split('\n')[1].strip()
    except IndexError:
        # No second line: show the raw paragraph for inspection and record
        # empty words rather than repeating the previous speaker's words.
        print(target)
        words = ''

    transcript.append({'name': name, 'time': timestamp, 'words': words})
    prev_time = timestamp

# The statements that follow operate on `df`. Explicit columns keep the
# expected schema even when no paragraphs were found.
df = pd.DataFrame(transcript, columns=['name', 'time', 'words'])

def better_time(t):
    """Return *t* with a leading ``"00:"`` unless it already has three fields.

    A string containing at least two ``':'`` separators is returned
    unchanged; any other string gets ``"00:"`` prepended.
    """
    has_three_fields = t.count(':') >= 2
    return t if has_three_fields else "00:" + t

# Apply better_time to every scraped timestamp. Mapping over the single
# 'time' column avoids building a row object per record just to read one
# field, and yields an empty column (not an empty frame) when df has no rows.
df['better_time'] = df['time'].map(better_time)


def convert_time(t):
    """Parse an ISO-format time string into a ``datetime.time``.

    Args:
        t: The time string to parse, e.g. ``"00:01:20"``.

    Returns:
        The parsed ``datetime.time``, or the string ``'error'`` when *t* is
        not a string or is not a valid ISO time.
    """
    # Imported locally so the function does not depend on whatever the
    # module-level name `time` happens to be bound to when it is called.
    from datetime import time as _time

    try:
        return _time.fromisoformat(t)
    except (TypeError, ValueError):
        # TypeError: t is not a str; ValueError: t is not a valid ISO time.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'error'
# Parse each 'better_time' value with convert_time; values that fail to parse
# hold the 'error' sentinel that convert_time returns.
df['timestamp'] = df['better_time'].map(convert_time)



# Despite its name, 'datetime' holds each timestamp as a Timedelta.
# errors='coerce' turns unparseable values (the 'error' sentinel) into NaT
# instead of aborting the whole conversion with a ValueError.
df['datetime'] = pd.to_timedelta(df['timestamp'].astype(str), errors='coerce')
# Difference between each row's value and the previous row's; the first row
# (and any row next to a NaT) is NaT.
df['time_diff'] = df['datetime'].diff()
df['seconds'] = df['time_diff'].dt.total_seconds()