import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup as bs
url = "https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html"

# Download the executed-offenders index page. A timeout keeps a hung
# connection from stalling the whole script indefinitely.
page = requests.get(url, timeout=30)
print(page.status_code)  # 200 confirms the page was fetched successfully

# Parse the HTML. The data of interest is the table whose class attribute
# is "tdcj_table indent".
soup = bs(page.content, "html.parser")
table = soup.find("table", class_="tdcj_table indent")
if table is None:
    # Fail loudly with a clear message instead of an AttributeError below.
    raise RuntimeError("offenders table not found — page layout may have changed")

# Column headers come from the first <tr>. Its .text is newline-separated;
# slicing [1:-1] trims the empty leading/trailing pieces of the split.
info = table.find("tr")
headers = info.text.split('\n')[1:-1]
# Walk every table row and build one dict per offender.
all_info = table.find_all("tr")
all_dics = []
for info in all_info:
    row = info.find_all("td")
    row_obj = {}
    for i, column_value in enumerate(row):
        if i in (1, 2):
            # Columns 1 and 2 are link cells (offender info page and last
            # statement page): key on the anchor text, store the href.
            # The original duplicated this branch for i == 1 and i == 2
            # and called find('a') twice per cell.
            anchor = column_value.find('a')
            row_obj[anchor.text] = anchor.get('href')
        else:
            row_obj[headers[i]] = column_value.text
    # NOTE: the header <tr> has no <td>s, so it contributes an empty dict;
    # that empty record is dropped from the DataFrame later.
    all_dics.append(row_obj)

df = pd.DataFrame(all_dics)
df
# Drop the empty record produced by the header row (index 0) and the last
# three columns, which this analysis does not use.
# (The original performed this exact two-step cleanup twice in a row; the
# second pass re-derived clean_df from df, so a single pass is equivalent.)
clean_df = df.copy()[1:]
clean_df = clean_df[clean_df.columns[:-3]]
clean_df
# for testing
# clean_df = clean_df[:5]
# for link in clean_df['Last Statement'].tolist():
def get_last_statement(link):
    """Fetch an offender's last-statement page and return the statement text.

    Parameters
    ----------
    link : str or None
        href relative to the death_row section, taken from the offenders
        table (e.g. "dr_info/smithlast.html").

    Returns
    -------
    str
        The statement text; 'no statement' when the page lacks one;
        'no link' when the link is missing or the page cannot be fetched.
    """
    # Guard explicitly instead of letting string concatenation raise a
    # TypeError that a bare except used to swallow.
    if not link:
        return 'no link'
    try:
        url = "https://www.tdcj.texas.gov/death_row/" + link
        page = requests.get(url, timeout=30)
        soup = bs(page.content, "html.parser")
        # The statement text sits in the <p> that follows the <p> holding
        # the "Last Statement" heading (next_sibling twice skips the
        # whitespace text node between tags).
        for p in soup.find_all("p"):
            if 'Last Statement' in p.text:
                try:
                    return p.next_sibling.next_sibling.text
                except AttributeError:
                    return 'no statement'
        # No "Last Statement" paragraph found on the page at all.
        # (The original fell off the loop and returned None here.)
        return 'no statement'
    except Exception:
        # Network failures, malformed links, unexpected markup — treat all
        # as "no usable link" rather than aborting the whole scrape.
        return 'no link'
# Fetch each offender's statement page and attach the text as a new column.
# Series.apply on the single column is equivalent to (and faster than) the
# original row-wise df.apply(lambda x: ..., axis=1).
clean_df['Last Statement Text'] = clean_df['Last Statement'].apply(get_last_statement)
clean_df