Getting the information about executed prisoners from the Texas Department of Criminal Justice

https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html

In [1]:
import requests
import lxml.html as lh
import pandas as pd
from bs4 import BeautifulSoup as bs
In [2]:
url = "https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html"
# Download the page; `page` is the HTTP response whose body holds the HTML.
# The explicit timeout (in seconds) keeps this cell from hanging forever if
# the server never answers -- requests waits indefinitely by default.
page = requests.get(url, timeout=30)
In [3]:
# Sanity check: a status code of 200 confirms the page was downloaded successfully.
print(page.status_code)
200
In [4]:
# Parse the downloaded HTML into a BeautifulSoup tree using the "html.parser" backend.
# While exploring, soup.prettify() can be printed to view the HTML in a readable, indented form.
soup= bs(page.content, "html.parser")
# print(soup.prettify())
In [5]:
# The data we want lives in the <table> element whose class attribute is "tdcj_table indent".
table = soup.find("table", class_="tdcj_table indent")
In [6]:
# The first <tr> of the table is the header row.  Its text holds one column
# title per line; the [1:-1] slice drops the first and last piece of the split
# (presumably the empty strings produced by the leading/trailing newline).
info = table.find("tr")
header_pieces = info.get_text().split("\n")
headers = header_pieces[1:-1]
headers
Out[6]:
['Execution',
 'Link',
 'Link',
 'Last Name',
 'First Name',
 'TDCJNumber',
 'Age',
 'Date',
 'Race',
 'County']
In [120]:
# Convert every <tr> of the table into a dict (one dict per table row).
#   * Cells 1 and 2 each hold a link: the link text becomes the key and the
#     href the value, so any variation in the link text produces its own key
#     (and therefore its own column once the dicts are turned into a frame).
#   * Every other cell is stored under the header at the same position.
# A row without <td> cells (e.g. the header row) contributes an empty dict.
LINK_CELLS = (1, 2)
all_dics = []
all_info = table.find_all("tr")
for info in all_info:
    row = info.find_all("td")
    row_obj = {}
    for i, column_value in enumerate(row):
        if i in LINK_CELLS:
            anchor = column_value.find('a')
            row_obj[anchor.text] = anchor.get('href')
            continue
        row_obj[headers[i]] = column_value.text
    all_dics.append(row_obj)
# all_dics
In [121]:
# Build one DataFrame row per scraped <tr>: the dict keys become the column
# names, and any key a given row lacks shows up as NaN in that row.
df = pd.DataFrame(all_dics)
In [122]:
# Inspect the raw frame.  Two artefacts of the scrape are visible here:
#   * row 0 is entirely NaN (it comes from a <tr> without any <td> cells);
#   * three extra trailing columns exist because a few rows used a slightly
#     different link text, which became a separate dict key.
df
Out[122]:
Execution Offender Information Last Statement Last Name First Name TDCJNumber Age Date Race County Offender Information Last Statement Last Statemen
0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 566 dr_info/halljusten.html dr_info/halljustenlast.html Hall Justen 999497 38 11/6/2019 White El Paso NaN NaN NaN
2 565 dr_info/sparksrobert.html dr_info/sparksrobertlast.html Sparks Robert 999542 45 9/25/2019 Black Dallas NaN NaN NaN
3 564 dr_info/solizmarkanthony.html dr_info/solizmarkanthonylast.html Soliz Mark 999571 37 9/10/2019 Hispanic Johnson NaN NaN NaN
4 563 dr_info/crutsingerbilly.html dr_info/crutsingerbillylast.html Crutsinger Billy 999459 64 9/4/2019 White Tarrant NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
562 5 dr_info/skillerndoyle.jpg dr_info/skillerndoylelast.html Skillern Doyle 518 49 01/16/1985 White Lubbock NaN NaN NaN
563 4 dr_info/barefootthomas.jpg dr_info/barefootthomaslast.html Barefoot Thomas 621 39 10/30/1984 White Bell NaN NaN NaN
564 3 dr_info/obryanronald.jpg dr_info/obryanronaldlast.html O'Bryan Ronald 529 39 03/31/1984 White Harris NaN NaN NaN
565 2 dr_info/autryjames.html dr_info/no_last_statement.html Autry James 670 29 03/14/1984 White Jefferson NaN NaN NaN
566 1 dr_info/brookscharlie.html dr_info/brookscharlielast.html Brooks, Jr. Charlie 592 40 12/07/1982 Black Tarrant NaN NaN NaN

567 rows × 13 columns

In [142]:
# Drop row 0 (the all-NaN row produced by the header <tr>) and take an explicit
# copy so that adding columns later does not raise SettingWithCopyWarning.
clean_df = df.iloc[1:].copy()

# The link columns are keyed by the anchor text, so rows whose link text
# differs slightly from the usual wording (extra whitespace, a truncated
# word, ...) stored their link in a stray trailing column and have NaN in the
# regular one.  Fold every stray column back into the column it belongs to
# before removing it; simply cutting off the trailing columns would silently
# discard the links of those rows.
for canonical in ("Offender Information", "Last Statement"):
    for column in list(clean_df.columns):
        # Collapse all whitespace (including non-breaking spaces) before comparing.
        normalized = " ".join(str(column).split())
        if column != canonical and normalized and canonical.startswith(normalized):
            clean_df[canonical] = clean_df[canonical].fillna(clean_df[column])
            clean_df = clean_df.drop(columns=column)
clean_df
Out[142]:
Execution Offender Information Last Statement Last Name First Name TDCJNumber Age Date Race County
1 566 dr_info/halljusten.html dr_info/halljustenlast.html Hall Justen 999497 38 11/6/2019 White El Paso
2 565 dr_info/sparksrobert.html dr_info/sparksrobertlast.html Sparks Robert 999542 45 9/25/2019 Black Dallas
3 564 dr_info/solizmarkanthony.html dr_info/solizmarkanthonylast.html Soliz Mark 999571 37 9/10/2019 Hispanic Johnson
4 563 dr_info/crutsingerbilly.html dr_info/crutsingerbillylast.html Crutsinger Billy 999459 64 9/4/2019 White Tarrant
5 562 dr_info/swearingenlarry.html dr_info/swearingenlarrylast.html Swearingen Larry 999361 48 8/21/2019 White Montgomery
... ... ... ... ... ... ... ... ... ... ...
562 5 dr_info/skillerndoyle.jpg dr_info/skillerndoylelast.html Skillern Doyle 518 49 01/16/1985 White Lubbock
563 4 dr_info/barefootthomas.jpg dr_info/barefootthomaslast.html Barefoot Thomas 621 39 10/30/1984 White Bell
564 3 dr_info/obryanronald.jpg dr_info/obryanronaldlast.html O'Bryan Ronald 529 39 03/31/1984 White Harris
565 2 dr_info/autryjames.html dr_info/no_last_statement.html Autry James 670 29 03/14/1984 White Jefferson
566 1 dr_info/brookscharlie.html dr_info/brookscharlielast.html Brooks, Jr. Charlie 592 40 12/07/1982 Black Tarrant

566 rows × 10 columns

In [147]:
# Rebuild the cleaned frame from the raw scrape.
# Drop row 0 (the all-NaN row produced by the header <tr>) and take an explicit
# copy so that adding columns later does not raise SettingWithCopyWarning.
clean_df = df.iloc[1:].copy()

# The link columns are keyed by the anchor text, so rows whose link text
# differs slightly from the usual wording (extra whitespace, a truncated
# word, ...) stored their link in a stray trailing column and have NaN in the
# regular one.  Fold every stray column back into the column it belongs to
# before removing it; simply cutting off the trailing columns would silently
# discard the links of those rows.
for canonical in ("Offender Information", "Last Statement"):
    for column in list(clean_df.columns):
        # Collapse all whitespace (including non-breaking spaces) before comparing.
        normalized = " ".join(str(column).split())
        if column != canonical and normalized and canonical.startswith(normalized):
            clean_df[canonical] = clean_df[canonical].fillna(clean_df[column])
            clean_df = clean_df.drop(columns=column)
clean_df
Out[147]:
Execution Offender Information Last Statement Last Name First Name TDCJNumber Age Date Race County
1 566 dr_info/halljusten.html dr_info/halljustenlast.html Hall Justen 999497 38 11/6/2019 White El Paso
2 565 dr_info/sparksrobert.html dr_info/sparksrobertlast.html Sparks Robert 999542 45 9/25/2019 Black Dallas
3 564 dr_info/solizmarkanthony.html dr_info/solizmarkanthonylast.html Soliz Mark 999571 37 9/10/2019 Hispanic Johnson
4 563 dr_info/crutsingerbilly.html dr_info/crutsingerbillylast.html Crutsinger Billy 999459 64 9/4/2019 White Tarrant
5 562 dr_info/swearingenlarry.html dr_info/swearingenlarrylast.html Swearingen Larry 999361 48 8/21/2019 White Montgomery
... ... ... ... ... ... ... ... ... ... ...
562 5 dr_info/skillerndoyle.jpg dr_info/skillerndoylelast.html Skillern Doyle 518 49 01/16/1985 White Lubbock
563 4 dr_info/barefootthomas.jpg dr_info/barefootthomaslast.html Barefoot Thomas 621 39 10/30/1984 White Bell
564 3 dr_info/obryanronald.jpg dr_info/obryanronaldlast.html O'Bryan Ronald 529 39 03/31/1984 White Harris
565 2 dr_info/autryjames.html dr_info/no_last_statement.html Autry James 670 29 03/14/1984 White Jefferson
566 1 dr_info/brookscharlie.html dr_info/brookscharlielast.html Brooks, Jr. Charlie 592 40 12/07/1982 Black Tarrant

566 rows × 10 columns

In [149]:
# for testing
# clean_df = clean_df[:5]
# for link in clean_df['Last Statement'].tolist():
    
def get_last_statement(link, timeout=30):
    """Fetch an offender's last-statement page and return the statement text.

    The page is searched for the first ``<p>`` whose text contains
    ``'Last Statement'``.  The statement is read from the element two siblings
    after that paragraph (the sibling in between is presumably a whitespace
    text node).

    Parameters
    ----------
    link : str
        Path of the page relative to ``https://www.tdcj.texas.gov/death_row/``,
        e.g. a value of the ``Last Statement`` column.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot block the whole scrape.

    Returns
    -------
    str
        The statement text; ``'no statement'`` if the page has no usable
        'Last Statement' paragraph; ``'no link'`` if the page could not be
        fetched or parsed (including a missing / non-string ``link``).
    """
    try:
        url = "https://www.tdcj.texas.gov/death_row/" + link
        page = requests.get(url, timeout=timeout)
        soup = bs(page.content, "html.parser")
        for p in soup.find_all("p"):
            try:
                if 'Last Statement' in p.text:
                    return p.next_sibling.next_sibling.text
            except Exception:
                # The label paragraph exists but is not followed by the expected element.
                return 'no statement'
    # `Exception` instead of a bare `except:` keeps the best-effort behaviour but
    # lets KeyboardInterrupt / SystemExit through, so a long scrape can be stopped.
    except Exception:
        return 'no link'
    # No paragraph mentioned 'Last Statement': report it explicitly instead of
    # falling off the end of the function and returning None.
    return 'no statement'
            
    
# Download the last statement of every offender.  Only the 'Last Statement'
# link is needed, so the function is applied to that single column instead of
# to whole rows.  This issues one HTTP request per row, so it takes a while.
clean_df['Last Statement Text'] = clean_df['Last Statement'].apply(get_last_statement)
In [150]:
# Show the frame together with the newly added 'Last Statement Text' column.
clean_df
Out[150]:
Execution Offender Information Last Statement Last Name First Name TDCJNumber Age Date Race County Last Statement Text
1 566 dr_info/halljusten.html dr_info/halljustenlast.html Hall Justen 999497 38 11/6/2019 White El Paso Yeah, I want to address the Roundtree family ...
2 565 dr_info/sparksrobert.html dr_info/sparksrobertlast.html Sparks Robert 999542 45 9/25/2019 Black Dallas Umm, Pamela can you hear me Stephanie, Hardy,...
3 564 dr_info/solizmarkanthony.html dr_info/solizmarkanthonylast.html Soliz Mark 999571 37 9/10/2019 Hispanic Johnson It’s 6:09 on September 10th, Kayla and David,...
4 563 dr_info/crutsingerbilly.html dr_info/crutsingerbillylast.html Crutsinger Billy 999459 64 9/4/2019 White Tarrant Hi ladies I wanted to tell ya’ll how much I l...
5 562 dr_info/swearingenlarry.html dr_info/swearingenlarrylast.html Swearingen Larry 999361 48 8/21/2019 White Montgomery Lord forgive them. They don’t know what they ...
... ... ... ... ... ... ... ... ... ... ... ...
562 5 dr_info/skillerndoyle.jpg dr_info/skillerndoylelast.html Skillern Doyle 518 49 01/16/1985 White Lubbock I pray that my family will rejoice and will fo...
563 4 dr_info/barefootthomas.jpg dr_info/barefootthomaslast.html Barefoot Thomas 621 39 10/30/1984 White Bell When asked if he had a last statement, he rep...
564 3 dr_info/obryanronald.jpg dr_info/obryanronaldlast.html O'Bryan Ronald 529 39 03/31/1984 White Harris What is about to transpire in a few moments is...
565 2 dr_info/autryjames.html dr_info/no_last_statement.html Autry James 670 29 03/14/1984 White Jefferson This offender declined to make a last statemen...
566 1 dr_info/brookscharlie.html dr_info/brookscharlielast.html Brooks, Jr. Charlie 592 40 12/07/1982 Black Tarrant Statement to the Media: I, at this very moment...

566 rows × 11 columns

In [195]:
# For testing: keep only the first five offenders.  The explicit copy makes
# this an independent frame, so adding a column to it later does not raise
# pandas' SettingWithCopyWarning.
clean_df = clean_df[:5].copy()
def get_last_statement(link, timeout=30):
    """Fetch an offender-information page and return its table as a dict.

    NOTE: despite its name, this function does not read last statements.  It
    redefines (and therefore shadows) the earlier ``get_last_statement`` and
    is applied to the ``Offender Information`` links; the name is kept so the
    existing call keeps working.

    Every ``<tr>`` of the page is reduced to the list of its non-empty
    ``<td>`` texts, with non-breaking spaces removed.  Each list is printed
    (as before), and every row that provides at least a label and a value is
    stored as ``label -> value``.

    Parameters
    ----------
    link : str
        Path of the page relative to ``https://www.tdcj.texas.gov/death_row/``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict or str
        Mapping of row label to value (empty if the page has no such rows),
        or ``'no link'`` if the page could not be fetched or parsed
        (including a missing / non-string ``link``).
    """
    try:
        url = "https://www.tdcj.texas.gov/death_row/" + link
        page = requests.get(url, timeout=timeout)
        soup = bs(page.content, "html.parser")
        offender_obj = {}
        for t in soup.find_all("tr"):
            # Strip non-breaking spaces first, then keep only cells that still
            # have text; the cleaned text (not the raw one) is what gets stored.
            cleaned = (cell.text.replace(u'\xa0', u'') for cell in t.find_all('td'))
            row_array = [text for text in cleaned if len(text) > 0]
            print(row_array)
            # Rows with fewer than two non-empty cells carry no label/value pair.
            if len(row_array) >= 2:
                offender_obj[row_array[0]] = row_array[1]
        return offender_obj
    # `Exception` instead of a bare `except:` keeps the best-effort behaviour but
    # lets KeyboardInterrupt / SystemExit through, so the scrape can be stopped.
    except Exception:
        return 'no link'
            
    
# Scrape the offender-information page of every row.  Only the
# 'Offender Information' link is needed, so the function is applied to that
# single column instead of to whole rows (one HTTP request per row).
clean_df['Offender Info'] = clean_df['Offender Information'].apply(get_last_statement)
['Name', 'Hall, Justen Grant']
['TDCJ Number', '999497']
['Date of Birth', '06/16/1981']
['Date Received', '05/11/2005']
['Age (when Received)', '23']
['Education Level (Highest Grade Completed)', '9']
['Date of Offense', '10/28/2002']
['Age (at the time of Offense)', '21']
['County', 'El Paso']
['Race', 'White']
['Gender', 'Male']
['Hair Color', 'Brown']
['Height (in Feet and Inches)', '6′5″']
['Weight (in Pounds)', '197']
['Eye Color', 'Hazel']
['Native County', 'El Paso']
['Native State', 'Texas']
['Name', 'Sparks, Robert']
['TDCJ Number', '999542']
['Date of Birth', '02/13/1974']
['Date Received', '01/08/2009']
['Age (when Received)', '34']
['Education Level (Highest Grade Completed)', '8']
['Date of Offense', '09/15/2007']
['Age (at the time of Offense)', '33']
['County', 'Dallas']
['Race', 'Black']
['Gender', 'Male']
['Hair Color', 'Black']
['Height (in Feet and Inches)', '5′ 7″']
['Weight (in Pounds)', '247']
['Eye Color', 'Brown']
['Native County', 'Dallas']
['Native State', 'Texas']
['Name', 'Soliz, Mark Anthony']
['TDCJ Number', '999571']
['Date of Birth', '01/27/1982']
['Date Received', '03/28/2012']
['Age (when Received)', '30']
['Education Level (Highest Grade Completed)', '8']
['Date of Offense', '06/29/2010']
['Age (at the time of Offense)', '28']
['County', 'Johnson']
['Race', 'Hispanic']
['Gender', 'Male']
['Hair Color', 'Black']
['Height (in Feet and Inches)', '5′ 5″']
['Weight (in Pounds)', '177']
['Eye Color', 'Brown']
['Native County', 'Tarrant']
['Native State', 'Texas']
['Name', 'Crutsinger, Billy Jack']
['TDCJ Number', '999459']
['Date of Birth', '10/05/1954']
['Date Received', '10/08/2003']
['Age (when Received)', '49']
['Education Level (Highest Grade Completed)', '11']
['Date of Offense', '04/06/2003']
['Age (at the time of Offense)', '48']
['County', 'Tarrant']
['Race', 'White']
['Gender', 'Male']
['Hair Color', 'Gray']
['Height (in Feet and Inches)', '5′ 9″']
['Weight (in Pounds)', '217']
['Eye Color', 'Green']
['Native County', 'Tarrant']
['Native State', 'Texas']
['Name', 'Larry Ray Swearingen']
['TDCJ Number', '999361']
['Date of Birth', '5/21/1971']
['Date Received', '7/12/2000']
['Age (when    Received)', '29']
['Education Level (Highest Grade Completed)', '11']
['Date of Offense', '12/8/1998']
['Age (at the time of Offense)', '27']
['County', 'Montgomery']
['Race', 'White']
['Gender', 'Male']
['Hair Color', 'Brown']
['Height (in Feet and Inches)', '5′ 10″']
['Weight (in Pounds)', '194']
['Eye Color', 'Blue']
['Native County', 'Montgomery']
['Native State', 'Texas']
[]
In [ ]: