import os 
import csv 
import pandas as pd

Reading in the df and looking at the first 5 rows¶

death_row = pd.read_csv("death_row_final_project.csv")
death_row.head()

# getting the shape of the df 
death_row.shape

(566, 24)

#looking at descriptive stats for the numeric columns
death_row.describe()

#looking at the data types for each column in the df 
death_row.dtypes

execution           int64
last_name          object
first_name         object
age_received       object
education_level    object
age_crime          object
occupation         object
prior_record       object
num_of_vic         object
main_crime         object
type_of_crime      object
weapon             object
co_defendants      object
race_vic           object
vic_kid            object
vic_male           object
vic_female         object
vic_police         object
inmate_number       int64
age                 int64
date_executed      object
race               object
county             object
last_statement     object
dtype: object

Things I want to change:¶

age_received, age-crime, num_of_vic, vic_kid, vic_male, vic_female need to be changed to int
education level might make sense to discretize
occupation, main_crime, type_of_crime, weapon, race_vic, race, county, late_name, first_name, prior_record, vic_police boolean turn to a factor
can remove execution, date_executed and inmate_number as it serves no useful purpose and is unique to each prisoner

# looking at the values for age_received to ensure all are numeric.
death_row.age_received.value_counts()

23         43
20         38
21         37
25         34
24         31
29         29
19         28
22         28
27         25
30         22
26         22
31         21
32         21
28         19
38         17
36         17
33         14
40         14
35         14
34         13
39         13
18         11
37          9
45          5
43          5
41          4
46          4
49          4
42          3
51          3
44          3
47          3
53          3
48          2
unknown     2
52          1
54          1
50          1
17          1
57          1
Name: age_received, dtype: int64

# need to remove unknown to n/a for all the columns that I want to change to numeric. 
numeric_columns = ["age_received", "age_crime", "num_of_vic", "vic_kid", "vic_male", "vic_female", "co_defendants"]

for column in numeric_columns: 
    death_row[column] = death_row[column].str.replace("unknown", "")

# checking to make sure that worked... 
death_row.age_received.value_counts()

23    43
20    38
21    37
25    34
24    31
29    29
19    28
22    28
27    25
30    22
26    22
31    21
32    21
28    19
38    17
36    17
33    14
40    14
35    14
34    13
39    13
18    11
37     9
45     5
43     5
41     4
46     4
49     4
42     3
51     3
44     3
47     3
53     3
48     2
       2
52     1
54     1
50     1
17     1
57     1
Name: age_received, dtype: int64

death_row[numeric_columns] = death_row[numeric_columns].apply(pd.to_numeric) #changes everything to float

death_row.head()

# I do not want to see the decimal places in the columns 
pd.options.display.float_format = "{:,.0f}".format

Replacing all missing values with the mean of each column¶

for column in numeric_columns: 
    print("The number of missing values in", column, "is", death_row[column].isna().sum())
    death_row[column] = death_row[column].fillna(death_row[column].mean())
    print("Now the number of missing values in", column, "is", death_row[column].isna().sum())

The number of missing values in age_received is 2
Now the number of missing values in age_received is 0
The number of missing values in age_crime is 2
Now the number of missing values in age_crime is 0
The number of missing values in num_of_vic is 1
Now the number of missing values in num_of_vic is 0
The number of missing values in vic_kid is 1
Now the number of missing values in vic_kid is 0
The number of missing values in vic_male is 2
Now the number of missing values in vic_male is 0
The number of missing values in vic_female is 2
Now the number of missing values in vic_female is 0
The number of missing values in co_defendants is 1
Now the number of missing values in co_defendants is 0

death_row.head()

Discretizing education level¶

death_row.education_level.value_counts()

12         110
11          75
10          75
9           72
ged         63
8           50
unknown     41
7           27
14          17
6            9
13           8
15           5
5            4
16           4
3            2
12.5         1
0            1
college      1
4            1
Name: education_level, dtype: int64

For discretizing, anything above 12 will be changed to college, 12 and ged will be changed to highschool, 9 - 11 to some highschool, less than 9 is not highschool

def replace_items_in_column_from_list(a_list_of_items_to_replace, df, column, word_to_be_changed_to):
    for item in a_list_of_items_to_replace: 
        df[column] = df[column].str.replace(item, word_to_be_changed_to)
    return df[column]

high_school = ["12", "ged"]
some_highschool = ["11", "10", "9"]
no_highschool = ["8", "7", "6", "5", "4", "3", "2", "1", "0"]
college = ["13", "14", "15", "12.5", "16"]
death_row["education_level"] = replace_items_in_column_from_list(college, death_row, "education_level", "college")
death_row["education_level"] = replace_items_in_column_from_list(high_school, death_row, "education_level", "highschool")
death_row["education_level"] = replace_items_in_column_from_list(some_highschool, death_row, "education_level", "some_highschool")
death_row["education_level"] = replace_items_in_column_from_list(no_highschool, death_row, "education_level", "no_highschool")
death_row.education_level.value_counts()

some_highschool    222
highschool         173
no_highschool       94
unknown             41
college             36
Name: education_level, dtype: int64

#Changing the education level to an ordered category 
cat = ["unknown", "no_highschool", "some_highschool", "highschool", "college"]
#Changing the month data type from int to ordered category 
death_row["education_level"] = pd.Categorical(death_row["education_level"], ordered = True, categories = cat)
#Checking to see if it worked 
death_row.education_level.dtype

CategoricalDtype(categories=['unknown', 'no_highschool', 'some_highschool', 'highschool',
                  'college'],
                 ordered=True)

death_row.head()

Changing the other columns that should be a category (factor)¶

def cat_fun(df, column): 
    df[column] = df[column].astype("category") 
    return(df[column])

category_columns = ["occupation", "main_crime", "type_of_crime", "weapon", "race", "race_vic", "county", "last_name", "first_name", "prior_record", "vic_police"]
for column in category_columns: 
    death_row[column] = cat_fun(death_row, column)

#Checking the data types
death_row.dtypes

execution             int64
last_name          category
first_name         category
age_received        float64
education_level    category
age_crime           float64
occupation         category
prior_record       category
num_of_vic          float64
main_crime         category
type_of_crime      category
weapon             category
co_defendants       float64
race_vic           category
vic_kid             float64
vic_male            float64
vic_female          float64
vic_police         category
inmate_number         int64
age                   int64
date_executed        object
race               category
county             category
last_statement       object
dtype: object

Removing execution and inmate number¶

death_row.drop(["execution", "inmate_number", "date_executed"], axis = 1, inplace = True)
death_row.head()

Aggregating a column: time_on_death_row¶

death_row["time_spent"] = death_row["age"] - death_row["age_received"]

looking = death_row[death_row["time_spent"] == 1]
looking
#double checking that the data is correct, as 1 year is a small amount of time spent on death row. 
#There were a few records that were incorrect based on data entry from the website we scraped. Those records were updated.

death_row.head()

On further thought, since all of the columns except for last_statement are different labels. I am going to discretize everything.¶

death_row.describe()

death_row.age_received.describe()

count   566
mean     29
std       8
min      17
25%      22
50%      27
75%      33
max      57
Name: age_received, dtype: float64

categories = ["teens", "twenties", "thirty+"]
death_row["age_received"] = pd.cut(death_row["age_received"], [0, 19, 29, 99], labels = categories)
death_row["age_crime"] = pd.cut(death_row["age_crime"], [0, 19, 29, 99], labels = categories)

death_row.head()

death_row.num_of_vic.describe()

count   566
mean      2
std       1
min       1
25%       1
50%       1
75%       2
max      15
Name: num_of_vic, dtype: float64

categories = ["one", "two+"]
death_row["num_of_vic"] = pd.cut(death_row["num_of_vic"], [0,1,99], labels = categories)

death_row.head()

Before discretizing vic_kid, vic_male, vic_female I am going to get a count of the total number of victims for each column¶

sum_kid_victims = death_row.vic_kid.sum(axis = 0, skipna = True).round()
print("The number of children victims is", sum_kid_victims)
sum_male_victims = death_row.vic_male.sum(axis = 0, skipna = True).round()
print("The number of male victims is", sum_male_victims)
sum_female_victims = death_row.vic_female.sum(axis = 0, skipna = True).round()
print("The number of female victims is", sum_female_victims)

The number of children victims is 154.0
The number of male victims is 458.0
The number of female victims is 466.0

# #Changing vic_kid, vic_male, vic_female back to object 
# columns = ["vic_kid", "vic_male", "vic_female"]
# for column in columns: 
#     death_row[column] = death_row[column].astype("object")

death_row.dtypes

last_name          category
first_name         category
age_received       category
education_level    category
age_crime          category
occupation         category
prior_record       category
num_of_vic         category
main_crime         category
type_of_crime      category
weapon             category
co_defendants       float64
race_vic           category
vic_kid             float64
vic_male            float64
vic_female          float64
vic_police         category
age                   int64
race               category
county             category
last_statement       object
time_spent          float64
dtype: object

death_row.vic_kid.value_counts()

0    460
1     67
2     30
3      6
0      1
5      1
4      1
Name: vic_kid, dtype: int64

# numeric_columns = ["vic_kid", "vic_male", "vic_female"]
# death_row[numeric_columns] = death_row[numeric_columns].apply(pd.to_numeric)

categories = ["no", "yes"]
death_row["vic_kid"] = pd.cut(death_row["vic_kid"], [-1, 0, 99], labels = categories)

death_row.head()

death_row["vic_male"] = pd.cut(death_row["vic_male"], [-1, 0, 99], labels = categories)
death_row["vic_female"] = pd.cut(death_row["vic_female"], [-1, 0, 99], labels = categories)
death_row.head()

death_row.time_spent.describe()

count   566
mean     11
std       5
min      -1
25%       8
50%      11
75%      13
max      32
Name: time_spent, dtype: float64

categories = ["10_or_less", "10+"]
death_row["time_spent"] = pd.cut(death_row["time_spent"], [-1, 10, 99], labels = categories)

death_row.head()

death_row.age.describe()

count   566
mean     40
std       9
min      24
25%      33
50%      38
75%      45
max      70
Name: age, dtype: float64

categories = ["18-34", "35-45", "45+"]
death_row["age"] = pd.cut(death_row["age"], [18, 34, 45, 99], labels = categories)

death_row.head()

death_row.dtypes

last_name          category
first_name         category
age_received       category
education_level    category
age_crime          category
occupation         category
prior_record       category
num_of_vic         category
main_crime         category
type_of_crime      category
weapon             category
co_defendants       float64
race_vic           category
vic_kid            category
vic_male           category
vic_female         category
vic_police         category
age                category
race               category
county             category
last_statement       object
time_spent         category
dtype: object

death_row.co_defendants.describe()

count   566
mean      1
std       1
min       0
25%       0
50%       0
75%       1
max       9
Name: co_defendants, dtype: float64

categories = ["no", "yes"]
death_row["co_defendants"] = pd.cut(death_row["co_defendants"], [-1, 0, 99], labels = categories)

death_row.head()

death_row.dtypes

last_name          category
first_name         category
age_received       category
education_level    category
age_crime          category
occupation         category
prior_record       category
num_of_vic         category
main_crime         category
type_of_crime      category
weapon             category
co_defendants      category
race_vic           category
vic_kid            category
vic_male           category
vic_female         category
vic_police         category
age                category
race               category
county             category
last_statement       object
time_spent         category
dtype: object

Ultimately we do not need the prisoner's first and last name unless we want to look to see if there are any specific names that occur more frequently than others. Therefore, I am removing those two columns¶

death_row.drop(["last_name", "first_name"], axis = 1, inplace = True)
death_row.head()

Now the data is ready to analyze¶

death_row.last_statement.head(25)

0     Yeah, I want to  address the Roundtree family ...
1     Umm, Pamela can you  hear me Stephanie, Hardy,...
2     It's 6:09 on  September 10th, Kayla and David,...
3     Hi ladies I wanted to  tell ya'll how much I l...
4     Lord forgive them.  They don't know what they ...
5                                           Spoken: No.
6     Yes Sir, that will be five Dollars I love you,...
7     To my friends and family it was a nice journey...
8     Yes Sir, I would like to thank the Shape Commu...
9     Yes Sir. Dear Heavenly Father please forgive t...
10    I am very thankful for all the hard work the M...
11                                  No statement given.
12    Thank you I love you all. Sandra, nice meeting...
13    l want to make sure the Patel family knows I l...
14                                         no statement
15    To everyone that has been there for me you kno...
16    Yes, I would like to say nephew it burns huh. ...
17    First I would like to say I have been here sin...
18    No, Well, Hi Mary Jean. See y'all later. Go ah...
19    First I would like to praise my Lord Jesus  Ch...
20    I'd like to take a moment to say I'm sorry.  N...
21                                                  NaN
22                                                  NaN
23    First and foremost I'd like to say, "Justice h...
24    Yes, I do, Grace Kehler is that you? I have gi...
Name: last_statement, dtype: object

death_row.last_statement.tail(25)

541    This offender declined to make a last statemen...
542    This offender declined to make a last statemen...
543    Mother, I am sorry for all the pain I've cause...
544    This offender declined to make a last statemen...
545    This offender declined to make a last statemen...
546    This offender declined to make a last statemen...
547    I want to say I'm sorry for the things I've do...
548    This offender declined to make a last statemen...
549    Tell my mother I love her and continue on with...
550    Goodbye to my family; I love all of you, I'm s...
551                    I have no last words. I am ready.
552    Goodbye to all my friends; be cool. Thank you ...
553    "Be strong for me," Pinkerton told his father,...
554    This offender declined to make a last statemen...
555        I deserve this. Tell everyone I said goodbye.
556    D.J., Laurie, Dr. Wheat, about all I can say i...
557    I want to thank Father Walsh for his spiritual...
558     There's no God but Allah, and unto thy I belo...
559    This offender declined to make a last statemen...
560    Heavenly Father, I give thanks for this time, ...
561    I pray that my family will rejoice and will fo...
562     When asked if he had a last statement, he rep...
563    What is about to transpire in a few moments is...
564    This offender declined to make a last statemen...
565    Statement to the Media: I, at this very moment...
Name: last_statement, dtype: object

''' different ways that no statment is represented: 
1. Spoken: No. 
2. No statement given. 
3. no statement 
4. This offender declined to make a last statement.

going to replace all of these with nothing'''

death_row["last_statement"] = death_row["last_statement"].str.replace("Spoken: No.", "none")
death_row["last_statement"] = death_row["last_statement"].str.replace("No statement given.", "none")
death_row["last_statement"] = death_row["last_statement"].str.replace("no statement", "none")
death_row["last_statement"] = death_row["last_statement"].str.replace("This offender declined to make a last statement.", "none")

death_row.last_statement.head(25)

0     Yeah, I want to  address the Roundtree family ...
1     Umm, Pamela can you  hear me Stephanie, Hardy,...
2     It's 6:09 on  September 10th, Kayla and David,...
3     Hi ladies I wanted to  tell ya'll how much I l...
4     Lord forgive them.  They don't know what they ...
5                                                  none
6     Yes Sir, that will be five Dollars I love you,...
7     To my friends and family it was a nice journey...
8     Yes Sir, I would like to thank the Shape Commu...
9     Yes Sir. Dear Heavenly Father please forgive t...
10    I am very thankful for all the hard work the M...
11                                                 none
12    Thank you I love you all. Sandra, nice meeting...
13    l want to make sure the Patel family knows I l...
14                                                 none
15    To everyone that has been there for me you kno...
16    Yes, I would like to say nephew it burns huh. ...
17    First I would like to say I have been here sin...
18    No, Well, Hi Mary Jean. See y'all later. Go ah...
19    First I would like to praise my Lord Jesus  Ch...
20    I'd like to take a moment to say I'm sorry.  N...
21                                                  NaN
22                                                  NaN
23    First and foremost I'd like to say, "Justice h...
24    Yes, I do, Grace Kehler is that you? I have gi...
Name: last_statement, dtype: object

death_row.last_statement.tail(25)

541                                               none  
542                                               none  
543    Mother, I am sorry for all the pain I've cause...
544                                               none  
545                                               none  
546                                               none  
547    I want to say I'm sorry for the things I've do...
548                                               none  
549    Tell my mother I love her and continue on with...
550    Goodbye to my family; I love all of you, I'm s...
551                    I have no last words. I am ready.
552    Goodbye to all my friends; be cool. Thank you ...
553    "Be strong for me," Pinkerton told his father,...
554                                               none  
555        I deserve this. Tell everyone I said goodbye.
556    D.J., Laurie, Dr. Wheat, about all I can say i...
557    I want to thank Father Walsh for his spiritual...
558     There's no God but Allah, and unto thy I belo...
559                                               none  
560    Heavenly Father, I give thanks for this time, ...
561    I pray that my family will rejoice and will fo...
562     When asked if he had a last statement, he rep...
563    What is about to transpire in a few moments is...
564                                               none  
565    Statement to the Media: I, at this very moment...
Name: last_statement, dtype: object

Now we can analyze (hopefully)¶

death_row.describe()

Going through each column and looking at the categories¶

print(death_row.age_received.value_counts())

twenties    308
thirty+     218
teens        40
Name: age_received, dtype: int64

column_names = list(death_row.columns)
column_names.remove("last_statement")

column_names

['age_received',
 'education_level',
 'age_crime',
 'occupation',
 'prior_record',
 'num_of_vic',
 'main_crime',
 'type_of_crime',
 'weapon',
 'co_defendants',
 'race_vic',
 'vic_kid',
 'vic_male',
 'vic_female',
 'vic_police',
 'age',
 'race',
 'county',
 'time_spent']

for column in column_names: 
    print("-------------", column, "-------------", "\n", death_row[column].value_counts())

------------- age_received ------------- 
 twenties    308
thirty+     218
teens        40
Name: age_received, dtype: int64
------------- education_level ------------- 
 some_highschool    222
highschool         173
no_highschool       94
unknown             41
college             36
Name: education_level, dtype: int64
------------- age_crime ------------- 
 twenties    299
thirty+     180
teens        87
Name: age_crime, dtype: int64
------------- occupation ------------- 
 laborer            206
unknown             47
mechanic            32
construction        28
food service        22
                  ... 
press operator       1
farm worker          1
factory worker       1
produce broker       1
ac/heating tech      1
Name: occupation, Length: 78, dtype: int64
------------- prior_record ------------- 
 yes        298
no         253
unknown     14
no           1
Name: prior_record, dtype: int64
------------- num_of_vic ------------- 
 one     354
two+    212
Name: num_of_vic, dtype: int64
------------- main_crime ------------- 
 murder, robbery                                            209
murder                                                     115
murder, rape                                                47
murder, kidnapping, rape                                    30
murder, rape, robbery                                       28
murder, car theft                                           25
murder, eluding arrest                                      20
murder, kidnapping                                          20
murder, kidnapping, robbery                                 14
murder, kidnapping, rape, robbery                           10
murder, car theft, robbery                                   8
murder, insurance scam                                       6
murder, escape                                               4
murder-serial, rape-serial                                   2
murder, eluding arrest, robbery                              2
murder, car theft, rape                                      2
murder-serial, rape                                          2
murder, for hire, rape                                       2
murder, car theft, rape, robbery                             1
murder, car theft, kidnappy, robbery                         1
murder, car theft, kidnapping, rape                          1
murder, car theft, kidnapping                                1
murder, eluding arrest, kidnapping                           1
murder, eluding arrest, rape                                 1
murder, car theft, eluding arrest, kidnapping, robbery       1
unknown                                                      1
murder, for hire                                             1
murder, identity theft                                       1
murder-serial, robbery-serial, rape                          1
murder, kidnapping, ransom                                   1
murder, mutilation-sexual                                    1
murder-attempted, escape                                     1
murder-attempted, robbery                                    1
murder-serial                                                1
murder-serial, rape, robbery                                 1
murder-serial, rape-serial, kidnapping                       1
murder-serial, robbery-serial                                1
murder, insurance scam, rape                                 1
Name: main_crime, dtype: int64
------------- type_of_crime ------------- 
 shooting                          290
stabbing                           82
strangling                         44
beating                            41
beating, stabbing                  20
beating, strangling                15
shooting, stabbing                  8
stabbing, strangling                4
shooting, strangling                4
beating, stabbing, strangling       4
drowning                            3
beating, shooting, stabbing         3
car                                 3
arson                               3
shooting, stabbing, strangling      3
shooting                            3
unknown                             2
beating, shooting                   2
arson, strangling                   2
arson, stabbing                     2
drowning, strangling                2
arson, shooting                     2
hate                                2
beating, shooting, strangling       2
arson, shooting, stabbing           1
arson, shooting, strangling         1
stabbing, strangulation             1
beating, drowning, stabbing         1
beating, rape                       1
arson, beating, stabbing            1
beating, rape, strangling           1
rape, shooting                      1
stabbing, strangling                1
poisoning                           1
shooting, stabbing                  1
broke neck                          1
buried alive, strangling            1
contract                            1
rape, stabbing                      1
drowning, shooting                  1
drowning, shooting, strangling      1
drugs                               1
neglect                             1
shotting                            1
Name: type_of_crime, dtype: int64
------------- weapon ------------- 
 gun                      297
knife                     78
hands                     41
hands, knife              12
gun, knife                10
                        ... 
cord, fireplace brush      1
cord, fire, gun            1
concrete                   1
coat hangers               1
ace bandage                1
Name: weapon, Length: 82, dtype: int64
------------- co_defendants ------------- 
 no     328
yes    238
Name: co_defendants, dtype: int64
------------- race_vic ------------- 
 white             298
unknown           106
hispanic           86
black              57
asian               9
unkown              4
middle eastern      2
black               2
white               1
samoan              1
Name: race_vic, dtype: int64
------------- vic_kid ------------- 
 no     460
yes    106
Name: vic_kid, dtype: int64
------------- vic_male ------------- 
 yes    356
no     210
Name: vic_male, dtype: int64
------------- vic_female ------------- 
 yes    329
no     237
Name: vic_female, dtype: int64
------------- vic_police ------------- 
 no         478
yes         50
no          32
yes          3
 no          2
unknown      1
Name: vic_police, dtype: int64
------------- age ------------- 
 35-45    245
18-34    188
45+      133
Name: age, dtype: int64
------------- race ------------- 
 White        250
Black        204
Hispanic     107
White          2
Other          2
Hispanic       1
Name: race, dtype: int64
------------- county ------------- 
 Harris        128
Dallas         59
Bexar          46
Tarrant        42
Montgomery     15
             ... 
Llano           1
Lubbock         1
Madison         1
McLennan        1
Kaufman         1
Name: county, Length: 113, dtype: int64
------------- time_spent ------------- 
 10+           284
10_or_less    281
Name: time_spent, dtype: int64

'''
Need to change shotting to shooting for type of crime
In order to predict weapon or type_of_crime a main crime needs to be decided

need to remove the space after white, black, in race_vic and also might want to change it to white, and non-white

need to remove the space after yesm and no for vic police and for the unknown changing it to no, because our belief is if it was it would have said yes 

need to remove the space after White, and Hispanic for race. Also might make sense to change to white, non-white

'''

'\nNeed to change shotting to shooting for type of crime\nIn order to predict weapon or type_of_crime a main crime needs to be decided\n\nneed to remove the space after white, black, in race_vic and also might want to change it to white, and non-white\n\nneed to remove the space after yesm and no for vic police and for the unknown changing it to no, because our belief is if it was it would have said yes \n\nneed to remove the space after White, and Hispanic for race. Also might make sense to change to white, non-white\n\n'

#It looks like there is a space after one no therefore removing that and looking again to see if we just have 3 categories
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("shotting", "shooting")
death_row.type_of_crime.value_counts()

shooting                          291
stabbing                           82
strangling                         44
beating                            41
beating, stabbing                  20
beating, strangling                15
shooting, stabbing                  8
beating, stabbing, strangling       4
shooting, strangling                4
stabbing, strangling                4
shooting, stabbing, strangling      3
drowning                            3
beating, shooting, stabbing         3
arson                               3
shooting                            3
car                                 3
hate                                2
arson, stabbing                     2
beating, shooting                   2
drowning, strangling                2
beating, shooting, strangling       2
arson, strangling                   2
arson, shooting                     2
unknown                             2
neglect                             1
rape, shooting                      1
beating, rape                       1
shooting, stabbing                  1
arson, shooting, stabbing           1
broke neck                          1
poisoning                           1
drowning, shooting, strangling      1
buried alive, strangling            1
beating, drowning, stabbing         1
arson, shooting, strangling         1
beating, rape, strangling           1
drowning, shooting                  1
arson, beating, stabbing            1
drugs                               1
contract                            1
stabbing, strangling                1
stabbing, strangulation             1
rape, stabbing                      1
Name: type_of_crime, dtype: int64

Because the majority of the crimes deal with guns, I think that it might be beneficial, in order to try to preduct type_of_crime, to change it to gun or non-gun. Non-gun would encompass any crime that did not use a gun so arson, stabbing, beating, strangling, drowning, car, neglect, rape. Contract will be included in gun.

death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("shooting", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("gun ", "gun")
death_row.type_of_crime.value_counts()

gun                              294
stabbing                          82
strangling                        44
beating                           41
beating, stabbing                 20
beating, strangling               15
gun, stabbing                      8
beating, stabbing, strangling      4
gun, strangling                    4
stabbing, strangling               4
car                                3
drowning                           3
arson                              3
gun, stabbing, strangling          3
beating, gun, stabbing             3
hate                               2
beating, gun                       2
arson, strangling                  2
beating, gun, strangling           2
arson, gun                         2
drowning, strangling               2
arson, stabbing                    2
unknown                            2
poisoning                          1
rape, stabbing                     1
drowning, gun, strangling          1
arson, gun, strangling             1
beating, rape                      1
arson, gun, stabbing               1
buried alive, strangling           1
broke neck                         1
stabbing, strangulation            1
arson, beating, stabbing           1
beating, drowning, stabbing        1
rape, gun                          1
beating, rape, strangling          1
gun, stabbing                      1
stabbing, strangling               1
drugs                              1
contract                           1
drowning, gun                      1
neglect                            1
Name: type_of_crime, dtype: int64

death_row["type_of_crime"] = death_row["type_of_crime"].astype("object")

death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("gun, stabbing", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("gun, strangling", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("beating, gun, stabbing", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("gun, stabbing, strangling", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("beating, gun", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("arson, gun", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("beating, gun, strangling", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("rape, gun", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("drowning, gun, strangling", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("arson, gun, stabbing", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("arson, gun, strangling", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("drowning, gun", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("contract", "gun")
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("gun, stabbing", "gun")

death_row.type_of_crime.value_counts()

gun                              324
stabbing                          82
strangling                        44
beating                           41
beating, stabbing                 20
beating, strangling               15
beating, stabbing, strangling      4
stabbing, strangling               4
drowning                           3
arson                              3
car                                3
hate                               2
drowning, strangling               2
unknown                            2
arson, strangling                  2
arson, stabbing                    2
poisoning                          1
arson, beating, stabbing           1
beating, rape                      1
gun                                1
broke neck                         1
beating, drowning, stabbing        1
stabbing, strangulation            1
rape, stabbing                     1
beating, rape, strangling          1
stabbing, strangling               1
drugs                              1
buried alive, strangling           1
neglect                            1
Name: type_of_crime, dtype: int64

death_row.type_of_crime.dtype

dtype('O')

death_row.loc[death_row["type_of_crime"] != "gun", "type_of_crime"] = "other"

death_row.type_of_crime.value_counts()

gun      324
other    242
Name: type_of_crime, dtype: int64

'''
need to remove the space after white, black, in race_vic 

need to remove the space after yes and no for vic police and for the unknown changing it to no, because our belief is if it was it would have said yes 

need to remove the space after White, and Hispanic for race. Also might make sense to change to white, non-white

'''

'\nneed to remove the space after white, black, in race_vic \n\nneed to remove the space after yes and no for vic police and for the unknown changing it to no, because our belief is if it was it would have said yes \n\nneed to remove the space after White, and Hispanic for race. Also might make sense to change to white, non-white\n\n'

death_row.head()

death_row.prior_record.value_counts()

yes        298
no         253
unknown     14
no           1
Name: prior_record, dtype: int64

#It looks like there is a space after one no therefore removing that and looking again to see if we just have 3 categories
death_row["prior_record"] = death_row["prior_record"].str.replace("no ", "no")
death_row.prior_record.value_counts()

yes        298
no         254
unknown     14
Name: prior_record, dtype: int64

death_row.race_vic.value_counts()

white             298
unknown           106
hispanic           86
black              57
asian               9
unkown              4
middle eastern      2
black               2
white               1
samoan              1
Name: race_vic, dtype: int64

#We have decided to break race into white, hispanic, black, other, and unknown 
death_row["race_vic"] = death_row["race_vic"].str.replace("white ", "white")
death_row["race_vic"] = death_row["race_vic"].str.replace("black ", "black")
death_row["race_vic"] = death_row["race_vic"].str.replace("unkown", "unknown")
death_row["race_vic"] = death_row["race_vic"].str.replace("asian", "other")
death_row["race_vic"] = death_row["race_vic"].str.replace("middle eastern", "other")
death_row["race_vic"] = death_row["race_vic"].str.replace("samoan", "other")
death_row.race_vic.value_counts()

white        299
unknown      110
hispanic      86
black         59
other         12
Name: race_vic, dtype: int64

death_row.race.value_counts()

White        250
Black        204
Hispanic     107
White          2
Other          2
Hispanic       1
Name: race, dtype: int64

#We have decided to break race into white, hispanic, black, other, and unknown 
death_row["race"] = death_row["race"].str.replace("White", "white")
death_row["race"] = death_row["race"].str.replace("white ", "white")
death_row["race"] = death_row["race"].str.replace("Black", "black")
death_row["race"] = death_row["race"].str.replace("Hispanic", "hispanic")
death_row["race"] = death_row["race"].str.replace("hispanic ", "hispanic")
death_row["race"] = death_row["race"].str.replace("Other", "other")

death_row.race.value_counts()

white       252
black       204
hispanic    108
other         2
Name: race, dtype: int64

death_row.vic_police.value_counts()

no         478
yes         50
no          32
yes          3
 no          2
unknown      1
Name: vic_police, dtype: int64

death_row["vic_police"] = death_row["vic_police"].str.replace("no ", "no")
death_row["vic_police"] = death_row["vic_police"].str.replace("yes ", "yes")
death_row["vic_police"] = death_row["vic_police"].str.replace(" no", "no")
death_row["vic_police"] = death_row["vic_police"].str.replace("unknown", "no")
death_row.vic_police.value_counts()

no     513
yes     53
Name: vic_police, dtype: int64

death_row.weapon.value_counts()

gun                      297
knife                     78
hands                     41
hands, knife              12
gun, knife                10
                        ... 
cord, fireplace brush      1
cord, fire, gun            1
concrete                   1
coat hangers               1
ace bandage                1
Name: weapon, Length: 82, dtype: int64

#Changing weapon to gun, knife, strangulation_item, other 
#If a gun is used it will be the main weapon, then goes knife, then strangulation_item... 
death_row["weapon"] = death_row["weapon"].str.replace("hands, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("gun, knife", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("cord, fireplace brush", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("cord, fire, gun", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("coat hangers", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("ace bandage", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("clothes", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("concrete", "other")
death_row["weapon"] = death_row["weapon"].str.replace("blunt object", "other")
death_row["weapon"] = death_row["weapon"].str.replace("gun, water", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("bag, gun", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("knife, rope", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("belt, club", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("sword", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("screwdriver", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("rock", "other")
death_row["weapon"] = death_row["weapon"].str.replace("gun, lamp", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("asphalt", "other")
death_row["weapon"] = death_row["weapon"].str.replace("board, strangulation_item", "other")
death_row["weapon"] = death_row["weapon"].str.replace("sissors", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("club, gun", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item, gun", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("gun, pipe", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("hammer", "other")
death_row["weapon"] = death_row["weapon"].str.replace("car", "other")
death_row["weapon"] = death_row["weapon"].str.replace("gun, hands", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("cord", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("tool", "other")
death_row["weapon"] = death_row["weapon"].str.replace("hammer, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("fire, gun", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("bat", "other")
death_row["weapon"] = death_row["weapon"].str.replace("fire", "other")
death_row["weapon"] = death_row["weapon"].str.replace("other, hands", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("unknown", "other")
death_row["weapon"] = death_row["weapon"].str.replace("gun, wire", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("board", "other")
death_row["weapon"] = death_row["weapon"].str.replace("knife, other", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("knife, pipe", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("bar", "other")
death_row["weapon"] = death_row["weapon"].str.replace("bathtub", "other")
death_row["weapon"] = death_row["weapon"].str.replace("rope", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item, hammer", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("bar, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("belt, fire", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("other, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("pillow", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item, other", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("belt, other", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("axe", "other")
death_row["weapon"] = death_row["weapon"].str.replace("otherhtub", "other")
death_row["weapon"] = death_row["weapon"].str.replace("heroin", "other")
death_row["weapon"] = death_row["weapon"].str.replace("knife, water", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("bag, other", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("other, other, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("starvation", "other")
death_row["weapon"] = death_row["weapon"].str.replace("statuette", "other")
death_row["weapon"] = death_row["weapon"].str.replace("steel lock", "other")
death_row["weapon"] = death_row["weapon"].str.replace("pickax", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("hands, strangulation_item", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item, hands", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("plastic tie wrap", "strangulation_item")
death_row["weapon"] = death_row["weapon"].str.replace("poison", "other")
death_row["weapon"] = death_row["weapon"].str.replace("cellophane, gun, sink", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("beer bottle, hands", "hands")
death_row["weapon"] = death_row["weapon"].str.replace("hands, water", "hands")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item, ice pick", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("hands, sand", "hands")
death_row["weapon"] = death_row["weapon"].str.replace("club", "other")
death_row["weapon"] = death_row["weapon"].str.replace("knife, mug", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("ice pick, knife", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("pipe", "other")
death_row["weapon"] = death_row["weapon"].str.replace("hatchet", "knife")
death_row["weapon"] = death_row["weapon"].str.replace("chain", "other")
death_row["weapon"] = death_row["weapon"].str.replace("bumper jack", "other")
death_row["weapon"] = death_row["weapon"].str.replace("frying pan", "other")
death_row["weapon"] = death_row["weapon"].str.replace("river", "other")
death_row["weapon"] = death_row["weapon"].str.replace("other, gun, other", "gun")
death_row["weapon"] = death_row["weapon"].str.replace("other, other", "other")
death_row["weapon"] = death_row["weapon"].str.replace("other ", "other")
death_row["weapon"] = death_row["weapon"].str.replace("other, knife", "other")
death_row["weapon"] = death_row["weapon"].str.replace("other, hands", "other")

death_row.weapon.value_counts()

gun                   324
knife                 114
other                  55
hands                  44
strangulation_item     29
Name: weapon, dtype: int64

death_row["weapon"] = death_row["weapon"].str.replace("hands", "other")
death_row["weapon"] = death_row["weapon"].str.replace("strangulation_item", "other")
death_row.weapon.value_counts()

gun      324
other    128
knife    114
Name: weapon, dtype: int64

death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, robbery-serial, rape", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, robbery-serial", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, rape-serial, kidnapping", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, rape, robbery", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, rape-serial", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-serial", "murder")
death_row["main_crime"] = death_row["main_crime"].str.replace("unknown", "murder")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-attempted, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder-attempted, escape", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, eluding arrest, kidnapping, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, kidnapping, rape", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, kidnappy, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, rape, robbery", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, kidnapping", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, eluding arrest, kidnapping", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, eluding arrest, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, kidnapping, rape, robbery", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, kidnapping, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, kidnapping, ransom", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, kidnapping, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, kidnapping", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, for hire, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, for hire", "murder")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, insurance scam, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, insurance scam", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft, rape", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, car theft", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, eluding arrest, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, eluding arrest", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, rape, robbery", "murder_rape_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, rape", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, mutilation-sexual", "murder_rape")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, identity theft", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, escape", "murder_other")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder, robbery", "murder_robbery")
death_row["main_crime"] = death_row["main_crime"].str.replace("murder_robbery ", "murder_robbery")

death_row.main_crime.value_counts()

murder_robbery         263
murder                 118
murder_rape             87
murder_other            54
murder_rape_robbery     44
Name: main_crime, dtype: int64

death_row.describe()

death_row["occupation"] = death_row["occupation"].astype("object")
death_row.loc[death_row["occupation"] != "laborer", "occupation"] = "other"

death_row.describe()

Now the data is ready to analyze/visualize/play with¶

column_names = list(death_row.columns)
column_names.remove("last_statement")
column_names

['age_received',
 'education_level',
 'age_crime',
 'occupation',
 'prior_record',
 'num_of_vic',
 'main_crime',
 'type_of_crime',
 'weapon',
 'co_defendants',
 'race_vic',
 'vic_kid',
 'vic_male',
 'vic_female',
 'vic_police',
 'age',
 'race',
 'county',
 'time_spent']

def get_value_counts(df, column): 
    new_df = pd.DataFrame(df[column].value_counts())
    new_df.columns = ["count"]
    new_df["category"] = new_df.index 
    new_df.reset_index(drop = True, inplace = True)
    return new_df

age_received_df = get_value_counts(death_row, "age_received")
age_received_df

from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt 
from matplotlib import cm 
from colorspacious import cspace_converter
import seaborn as sns

#What the function does: creates a bar graph
#Input: the df and title of the graph 
#Output: the bar graph
def category_bar_plot(df, title, rotation): 
    with sns.plotting_context("talk"):
        graph = sns.barplot(y = "count", x = "category", data = df, 
                           palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.xticks(rotation = rotation)
        return plt

category_bar_plot(age_received_df, "Age Received Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

education_level_df = get_value_counts(death_row, "education_level")
education_level_df

category_bar_plot(education_level_df, "Education Level Breakdown", 90)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

age_crime_df = get_value_counts(death_row, "age_crime")
age_crime_df

category_bar_plot(age_crime_df, "Age at Time of Crime Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

occupation_df = get_value_counts(death_row, "occupation")
occupation_df

category_bar_plot(occupation_df, "Occupation Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

prior_record_df = get_value_counts(death_row, "prior_record")
prior_record_df

category_bar_plot(prior_record_df, "Prior Record Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

num_of_vic_df = get_value_counts(death_row, "num_of_vic")
num_of_vic_df

category_bar_plot(num_of_vic_df, "Number of Crimes Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

main_crime_df = get_value_counts(death_row, "main_crime")
main_crime_df

category_bar_plot(main_crime_df, "Main Crime Breakdown", 90)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

type_of_crime_df = get_value_counts(death_row, "type_of_crime")
type_of_crime_df

category_bar_plot(type_of_crime_df, "Type of Crime Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

weapon_df = get_value_counts(death_row, "weapon")
weapon_df

category_bar_plot(weapon_df, "Main Weapon Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

co_defendants_df = get_value_counts(death_row, "co_defendants")
co_defendants_df

category_bar_plot(co_defendants_df, "Codefendant Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

race_vic_df = get_value_counts(death_row, "race_vic")
race_vic_df

category_bar_plot(race_vic_df, "Race of Victim Breakdown", 90)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

vic_kid_df = get_value_counts(death_row, "vic_kid")
vic_kid_df

category_bar_plot(vic_kid_df, "Victim a Child Breakdown", 90)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

vic_male_df = get_value_counts(death_row, "vic_male")
vic_male_df

category_bar_plot(vic_male_df, "Victim a Male Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

vic_female_df = get_value_counts(death_row, "vic_female")
vic_female_df

category_bar_plot(vic_female_df, "Victim a Female Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

vic_police_df = get_value_counts(death_row, "vic_police")
vic_police_df

category_bar_plot(vic_police_df, "Victim a Police Office Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

age_df = get_value_counts(death_row, "age")
age_df

category_bar_plot(age_df, "Age at Time of Execution Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

race_df = get_value_counts(death_row, "race")
race_df

category_bar_plot(race_df, "Race of Prisoner Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

county_df = get_value_counts(death_row, "county")
county_df.head(10)

time_spent_df = get_value_counts(death_row, "time_spent")
time_spent_df

category_bar_plot(time_spent_df, "Time Spent on Death Row Breakdown", 0)

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

Visualizing the last statements¶

from collections import Counter 
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

#What the function does: to be creating a list of reviews, then joining the reviews together to a string and 
                         #getting a count for each word in the string
#Input: df and column 
#Output: a dictionary with each word and the count of the word
def creating_freq_list_from_df_to_dict(df, column):
    reviews = df[column].tolist() 
    review_string = " ".join(reviews)
    review_string = review_string.split()
    review_dict = Counter(review_string)
    return review_dict

#What the function does: creates a word cloud that is in the shape of the mask passed in
#Input: the location where the mask image is saved, the frequency word dictionary, and the max # of words to include
        #and the title of the plot 
def create_word_cloud_with_mask(path_of_mask_image, dictionary, 
                                max_num_words, title):
        mask = np.array(Image.open(path_of_mask_image))
        #creating the word cloud 
        word_cloud = WordCloud(background_color = "white", 
                               max_words = max_num_words, 
                              mask = mask, max_font_size = 125)
        word_cloud.generate_from_frequencies(dictionary)
        #creating the coloring for the word cloud 
        image_colors = ImageColorGenerator(mask)
        plt.figure(figsize = [8,8])
        plt.imshow(word_cloud.recolor(color_func = image_colors), 
                  interpolation = "bilinear")
        plt.title(title)
        sns.set_context("poster")
        plt.axis("off")
        return plt

#What the function does: creates a df with two columns: word and count of the top 12 words
#Input: the word frequency dictionary 
#Output: a df with the top x words 
def word_freq_dict_to_df_top_words(dictionary, number_of_words_wanted): 
    df = pd.DataFrame.from_dict(dictionary,orient='index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by=["count"], ascending = False, inplace = True)
    df = df[:number_of_words_wanted]
    return(df)

#What the function does: creates a bar graph
#Input: the df and title of the graph 
#Output: the bar graph
def top_words_bar_plot(df, title): 
    with sns.plotting_context("talk"):
        graph = sns.barplot(y = "count", x = "word", data = df, 
                           palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.xticks(rotation = 90)
        return plt

#What the function does: creates a df with two columns: word and count 
#Input: the word frequency dictionary 
#Output: a df
def word_freq_dict_to_df_all_words(dictionary): 
    df = pd.DataFrame.from_dict(dictionary,orient='index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by=["count"], ascending = False, inplace = True)
    return(df)

#What the function does: Returns 2 statements: One with the total number of words and the other with the number 
                        #of unique words 
#Input: the frequency count dictionary 
#output: 2 statements 
def total_words_unique_words(dictionary): 
    eda_reviews_all_words = word_freq_dict_to_df_all_words(dictionary)
    print("The total number of words is", sum(eda_reviews_all_words["count"]))
    print("The total number of unique words is", len(dictionary))

def creating_freq_list_from_df_to_dict_2(df, column):
    reviews = df[column].tolist()
    reviews = [review if (type(review) == str) else 'number' for review in reviews]
    review_string = " ".join(reviews)
#     print(review_string)
    review_string = review_string.split()
    review_dict = Counter(review_string)
    return review_dict

last_statements_dic = creating_freq_list_from_df_to_dict_2(death_row, "last_statement")

#http://www.transparentpng.com/details/scroll-transparent-image-_4493.html
# create_word_cloud_with_mask("scroll3.png", last_statements_dic, 750, "Word Cloud Prior to Cleaning")

top_words = word_freq_dict_to_df_top_words(last_statements_dic, 20)
top_words

top_words_bar_plot(top_words, "Top 20 Words \n Prior to Cleaning and Separating")

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

total_words_unique_words(last_statements_dic)

The total number of words is 41690
The total number of unique words is 3098

It was decided to change all personal pronouns to "first_person_pronounds" and all other pronouns to "pronoun". The belief is that different types of criminal might speak of themselves versus other criminals. Punctuation will be removed prior to any changes and all words will be converted to lowercase.

death_row["last_statement"] = death_row["last_statement"].str.lower()
death_row["last_statement"] = death_row["last_statement"].str.replace(r"[^\w^\s]", "")
death_row["last_statement"] = death_row["last_statement"].str.replace(r"[0-9]+", "")

first_person_pronouns = [" i ", " me ", " mine ", " my ", " we ", " our ", " us ", " ours "]
pronouns = [" you ", " he ", " she ", " it ", " they ", " him ", " her ", " them ", " your ", " yours ", " his ", " hers ", " its "]

for word in first_person_pronouns: 
    death_row["last_statement"] = death_row["last_statement"].str.replace(word, " first_person_pronoun ")

for word in pronouns: 
    death_row["last_statement"] = death_row["last_statement"].str.replace(word, " pronoun ")

last_statements_dic = creating_freq_list_from_df_to_dict_2(death_row, "last_statement")
#http://www.transparentpng.com/details/scroll-transparent-image-_4493.html
# create_word_cloud_with_mask("scroll3.png", last_statements_dic, 750, "Word Cloud Prior to Cleaning")

top_words = word_freq_dict_to_df_top_words(last_statements_dic, 20)
top_words

top_words_bar_plot(top_words, "Top 20 Words")

<module 'matplotlib.pyplot' from '/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/matplotlib/pyplot.py'>

total_words_unique_words(last_statements_dic)

The total number of words is 41690
The total number of unique words is 3092

The last statements will be tokenized and any words less than 3 character will be removed. The last statements will then be stemmed using the snowball stemmer.

import nltk
from nltk.stem.snowball import SnowballStemmer

def tokenize_last_statement(statement):
    try:
        return nltk.word_tokenize(statement)
    except:
        return 'error'
death_row["last_statement"] = death_row.apply(lambda row: tokenize_last_statement(row["last_statement"]), axis = 1)

death_row.last_statement.head(10)

0    [yeah, first_person_pronoun, want, to, address...
1    [umm, pamela, can, pronoun, hear, first_person...
2    [its, on, september, th, kayla, and, david, fi...
3    [hi, ladies, first_person_pronoun, wanted, to,...
4    [lord, forgive, pronoun, pronoun, dont, know, ...
5                                               [none]
6    [yes, sir, that, will, be, five, dollars, firs...
7    [to, first_person_pronoun, friends, and, famil...
8    [yes, sir, first_person_pronoun, would, like, ...
9    [yes, sir, dear, heavenly, father, please, for...
Name: last_statement, dtype: object

df

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-204-00cf07b74dcd> in <module>
----> 1 df

NameError: name 'df' is not defined

death_row.to_csv('death_row_discritized.csv', index=False)

	execution	inmate_number	age
count	566.0000	566.000000	566.000000
mean	283.5000	531777.630742	39.726148
std	163.5344	498661.405354	8.828008
min	1.0000	511.000000	24.000000
25%	142.2500	819.250000	33.000000
50%	283.5000	999033.000000	38.000000
75%	424.7500	999269.750000	45.000000
max	566.0000	999571.000000	70.000000

	execution	last_name	first_name	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	...	vic_kid	vic_male	vic_female	vic_police	inmate_number	age	date_executed	race	county	last_statement
0	566	Hall	Justen	23	9	21	laborer	yes	1	murder	...	0	0	1	no	999497	38	11/6/2019	White	El Paso	Yeah, I want to address the Roundtree family ...
1	565	Sparks	Robert	34	8	33	machine operator	yes	3	murder	...	2	2	1	no	999542	45	9/25/2019	Black	Dallas	Umm, Pamela can you hear me Stephanie, Hardy,...
2	564	Soliz	Mark	30	8	28	cabinet maker	yes	1	murder, robbery	...	0	0	1	no	999571	37	9/10/2019	Hispanic	Johnson	It's 6:09 on September 10th, Kayla and David,...
3	563	Crutsinger	Billy	49	11	48	laborer	yes	2	murder	...	0	0	2	no	999459	64	9/4/2019	White	Tarrant	Hi ladies I wanted to tell ya'll how much I l...
4	562	Swearingen	Larry	29	11	27	laborer	yes	1	murder, kidnapping	...	0	0	1	no	999361	48	8/21/2019	White	Montgomery	Lord forgive them. They don't know what they ...

	execution	last_name	first_name	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	...	vic_kid	vic_male	vic_female	vic_police	inmate_number	age	date_executed	race	county	last_statement
0	566	Hall	Justen	23.0	9	21.0	laborer	yes	1.0	murder	...	0.0	0.0	1.0	no	999497	38	11/6/2019	White	El Paso	Yeah, I want to address the Roundtree family ...
1	565	Sparks	Robert	34.0	8	33.0	machine operator	yes	3.0	murder	...	2.0	2.0	1.0	no	999542	45	9/25/2019	Black	Dallas	Umm, Pamela can you hear me Stephanie, Hardy,...
2	564	Soliz	Mark	30.0	8	28.0	cabinet maker	yes	1.0	murder, robbery	...	0.0	0.0	1.0	no	999571	37	9/10/2019	Hispanic	Johnson	It's 6:09 on September 10th, Kayla and David,...
3	563	Crutsinger	Billy	49.0	11	48.0	laborer	yes	2.0	murder	...	0.0	0.0	2.0	no	999459	64	9/4/2019	White	Tarrant	Hi ladies I wanted to tell ya'll how much I l...
4	562	Swearingen	Larry	29.0	11	27.0	laborer	yes	1.0	murder, kidnapping	...	0.0	0.0	1.0	no	999361	48	8/21/2019	White	Montgomery	Lord forgive them. They don't know what they ...

	execution	last_name	first_name	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	...	vic_kid	vic_male	vic_female	vic_police	inmate_number	age	date_executed	race	county	last_statement
0	566	Hall	Justen	23	some_highschool	21	laborer	yes	1	murder	...	0	0	1	no	999497	38	11/6/2019	White	El Paso	Yeah, I want to address the Roundtree family ...
1	565	Sparks	Robert	34	no_highschool	33	machine operator	yes	3	murder	...	2	2	1	no	999542	45	9/25/2019	Black	Dallas	Umm, Pamela can you hear me Stephanie, Hardy,...
2	564	Soliz	Mark	30	no_highschool	28	cabinet maker	yes	1	murder, robbery	...	0	0	1	no	999571	37	9/10/2019	Hispanic	Johnson	It's 6:09 on September 10th, Kayla and David,...
3	563	Crutsinger	Billy	49	some_highschool	48	laborer	yes	2	murder	...	0	0	2	no	999459	64	9/4/2019	White	Tarrant	Hi ladies I wanted to tell ya'll how much I l...
4	562	Swearingen	Larry	29	some_highschool	27	laborer	yes	1	murder, kidnapping	...	0	0	1	no	999361	48	8/21/2019	White	Montgomery	Lord forgive them. They don't know what they ...

	last_name	first_name	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	type_of_crime	...	race_vic	vic_male	vic_female	vic_police	age	race	county	last_statement	time_spent
153	Rodriguez	Michael	39	highschool	40	laborer	yes	1	murder, escape	shooting	...	white	1	0	yes	40	Hispanic	Dallas	Yes I do, I know this no way makes up for all...	1
184	Swift	Christopher	30	some_highschool	28	laborer	yes	2	murder	strangling	...	white	0	2	no	31	White	Denton	This offender declined to make a last statemen...	1
344	Graham	Gary	38	some_highschool	18	laborer	no	1	murder, robbery	shooting	...	white	1	0	no	39	Black	Harris	I would like to say that I did not kill Bobby...	1
392	Foust	Aaron	25	highschool	24	laborer	no	1	murder, car theft, robbery	strangling	...	white	1	0	no	26	White	Tarrant	Adios, amigos, I'll see ya'll on the other sid...	1
420	Renfro	Steven	39	unknown	38	laborer	no	4	murder	shooting	...	white	2	2	yes	40	White	Harrison	I would like to tell the victims' families tha...	1
459	Gonzales, Jr.	Joe	35	highschool	31	construction	yes	1	murder, robbery	shooting	...	white	1	0	no	36	Hispanic	Potter	There are people all over the world who face t...	1

	last_name	first_name	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	type_of_crime	...	race_vic	vic_kid	vic_male	vic_female	vic_police	age	race	county	last_statement	time_spent
0	Hall	Justen	twenties	some_highschool	twenties	laborer	yes	1	murder	strangling	...	unkown	0	0	1	no	38	White	El Paso	Yeah, I want to address the Roundtree family ...	15
1	Sparks	Robert	thirty+	no_highschool	thirty+	machine operator	yes	3	murder	stabbing	...	black	2	2	1	no	45	Black	Dallas	Umm, Pamela can you hear me Stephanie, Hardy,...	11
2	Soliz	Mark	thirty+	no_highschool	twenties	cabinet maker	yes	1	murder, robbery	shooting	...	white	0	0	1	no	37	Hispanic	Johnson	It's 6:09 on September 10th, Kayla and David,...	7
3	Crutsinger	Billy	thirty+	some_highschool	thirty+	laborer	yes	2	murder	stabbing	...	white	0	0	2	no	64	White	Tarrant	Hi ladies I wanted to tell ya'll how much I l...	15
4	Swearingen	Larry	twenties	some_highschool	twenties	laborer	yes	1	murder, kidnapping	strangling	...	white	0	0	1	no	48	White	Montgomery	Lord forgive them. They don't know what they ...	19

	age_received	education_level	age_crime	occupation	prior_record	num_of_vic	main_crime	type_of_crime	weapon	co_defendants	race_vic	vic_kid	vic_male	vic_female	vic_police	age	race	county	last_statement	time_spent
count	566	566	566	566	566	566	566	566	566	566	566	566	566	566	566	566	566	566	564	565
unique	3	5	3	78	4	2	38	44	82	2	10	2	2	2	6	3	6	113	454	2
top	twenties	some_highschool	twenties	laborer	yes	one	murder, robbery	shooting	gun	no	white	no	yes	yes	no	35-45	White	Harris	none	10+
freq	308	222	299	206	298	354	209	290	297	328	298	460	356	329	478	245	250	128	101	284

	count	category
0	263	murder_robbery
1	118	murder
2	87	murder_rape
3	54	murder_other
4	44	murder_rape_robbery

	count	category
0	128	Harris
1	59	Dallas
2	46	Bexar
3	42	Tarrant
4	15	Montgomery
5	14	Jefferson
6	13	Nueces
7	12	Lubbock
8	11	Brazos
9	11	Smith

	count	word
1	3085	i
18	1608	you
3	1569	to
8	1325	and
5	1177	the
26	837	my
10	760	for
15	725	that
29	705	love
33	659	all
39	616	me
111	598	of
45	489	am
23	460	have
51	451	is
94	432	a
123	423	in
21	377	it
20	373	this
7	325	family

	count	word
1	4732	first_person_pronoun
18	2924	pronoun
3	1569	to
8	1325	and
5	1177	the
10	760	for
15	725	that
27	705	love
31	659	all
105	598	of
41	489	am
22	460	have
47	451	is
88	432	a
117	423	in
20	373	this
7	325	family
76	314	know
135	299	be
103	281	not