import os 
import csv 
import pandas as pd 
# Load the scraped death-row records; every later step works on this frame.
death_row = pd.read_csv("death_row_final_project.csv")
death_row.head()

# First look at the data: dimensions, numeric summary, and column dtypes.
death_row.shape
death_row.describe()
death_row.dtypes

# Inspect the raw age_received values before converting them to numbers.
death_row["age_received"].value_counts()

# Columns that should be numeric. The "unknown" placeholder is blanked out
# first so that pd.to_numeric can turn it into a missing value.
numeric_columns = ["age_received", "age_crime", "num_of_vic", "vic_kid", "vic_male", "vic_female", "co_defendants"]
for column in numeric_columns:
    without_placeholder = death_row[column].str.replace("unknown", "")
    death_row[column] = without_placeholder

# Confirm the placeholder is gone.
death_row["age_received"].value_counts()

# Convert every listed column to a numeric dtype.
death_row[numeric_columns] = death_row[numeric_columns].apply(pd.to_numeric)
death_row.head()

# Display floats without decimal places.
pd.options.display.float_format = "{:,.0f}".format

# Replace missing values with the column mean, reporting the number of
# missing entries before and after.
for column in numeric_columns:
    missing_before = death_row[column].isna().sum()
    print(f"The number of missing values in {column} is {missing_before}")
    column_mean = death_row[column].mean()
    death_row[column] = death_row[column].fillna(column_mean)
    missing_after = death_row[column].isna().sum()
    print(f"Now the number of missing values in {column} is {missing_after}")

death_row.head()
death_row["education_level"].value_counts()
# For discretizing: anything above 12 is changed to college, 12 and GED are changed to highschool, 9-11 to some_highschool, and anything below 9 to no_highschool.
def replace_items_in_column_from_list(a_list_of_items_to_replace, df, column, word_to_be_changed_to):
    """Replace each listed item with one word inside a string column.

    Every occurrence (substring match) of every item is replaced, one item
    after another in list order. The column is updated in place on ``df``.

    Args:
        a_list_of_items_to_replace: Literal strings to search for.
        df: DataFrame holding the column; it is modified in place.
        column: Name of the string column to edit.
        word_to_be_changed_to: Replacement text for every item.

    Returns:
        The updated column (``df[column]``).
    """
    for item in a_list_of_items_to_replace:
        # regex=False: the items are plain text. Without it, older pandas
        # versions treat them as regular expressions, so the "." in an item
        # such as "12.5" would match any character.
        df[column] = df[column].str.replace(item, word_to_be_changed_to, regex=False)
    return df[column]
# Raw education values grouped by the bucket they collapse into.
high_school = ["12", "ged"]
some_highschool = ["11", "10", "9"]
no_highschool = ["8", "7", "6", "5", "4", "3", "2", "1", "0"]
college = ["13", "14", "15", "12.5", "16"]

# The helper does substring replacement, so the buckets must be applied in
# this order: multi-digit values (and "12.5") are consumed before the shorter
# values they contain ("12", "1", "2", ...).
education_buckets = [
    (college, "college"),
    (high_school, "highschool"),
    (some_highschool, "some_highschool"),
    (no_highschool, "no_highschool"),
]
for raw_values, bucket_label in education_buckets:
    death_row["education_level"] = replace_items_in_column_from_list(
        raw_values, death_row, "education_level", bucket_label
    )
death_row["education_level"].value_counts()

# Turn education_level into an ordered category, lowest level first.
cat = ["unknown", "no_highschool", "some_highschool", "highschool", "college"]
death_row["education_level"] = pd.Categorical(death_row["education_level"], categories=cat, ordered=True)

# Check that the conversion worked.
death_row["education_level"].dtype
death_row.head()
def cat_fun(df, column):
    """Convert ``df[column]`` to the pandas ``category`` dtype in place.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the column to convert.

    Returns:
        The converted column (``df[column]``).
    """
    as_category = df[column].astype("category")
    df[column] = as_category
    return df[column]
# Columns stored as pandas categories.
category_columns = [
    "occupation", "main_crime", "type_of_crime", "weapon", "race", "race_vic",
    "county", "last_name", "first_name", "prior_record", "vic_police",
]
for column in category_columns:
    death_row[column] = cat_fun(death_row, column)

# Check the data types.
death_row.dtypes

# Drop columns that are not used further on.
death_row.drop(columns=["execution", "inmate_number", "date_executed"], inplace=True)
death_row.head()

# Derived feature: difference between the `age` column and the age at which
# the inmate was received.
death_row["time_spent"] = death_row["age"] - death_row["age_received"]

# Double-check the records with only 1 year, as that is a small amount of
# time to spend on death row. A few records were incorrect because of data
# entry on the website we scraped; those records were updated.
looking = death_row.loc[death_row["time_spent"] == 1]
looking

death_row.head()
death_row.describe()
death_row["age_received"].describe()
# Bin both age columns into the same three groups:
# (0, 19] -> teens, (19, 29] -> twenties, (29, 99] -> thirty+.
categories = ["teens", "twenties", "thirty+"]
age_bin_edges = [0, 19, 29, 99]
for column in ("age_received", "age_crime"):
    death_row[column] = pd.cut(death_row[column], age_bin_edges, labels=categories)
death_row.head()

# Number of victims: (0, 1] -> one, (1, 99] -> two+.
death_row["num_of_vic"].describe()
categories = ["one", "two+"]
death_row["num_of_vic"] = pd.cut(death_row["num_of_vic"], [0, 1, 99], labels=categories)
death_row.head()
# Report the rounded totals of child, male, and female victims.
sum_kid_victims = death_row["vic_kid"].sum(axis=0, skipna=True).round()
print("The number of children victims is", sum_kid_victims)
sum_male_victims = death_row["vic_male"].sum(axis=0, skipna=True).round()
print("The number of male victims is", sum_male_victims)
sum_female_victims = death_row["vic_female"].sum(axis=0, skipna=True).round()
print("The number of female victims is", sum_female_victims)

death_row.dtypes
death_row["vic_kid"].value_counts()

# Collapse each victim count into a flag: (-1, 0] -> no, (0, 99] -> yes.
categories = ["no", "yes"]
flag_bin_edges = [-1, 0, 99]
death_row["vic_kid"] = pd.cut(death_row["vic_kid"], flag_bin_edges, labels=categories)
death_row.head()
for column in ("vic_male", "vic_female"):
    death_row[column] = pd.cut(death_row[column], flag_bin_edges, labels=categories)
death_row.head()
# Time spent on death row: (-1, 10] -> 10_or_less, (10, 99] -> 10+.
death_row.time_spent.describe()
categories = ["10_or_less", "10+"]
death_row["time_spent"] = pd.cut(death_row["time_spent"], [-1, 10, 99], labels = categories)
death_row.head()

# Age groups. pd.cut bins are open on the left by default, which would turn
# an age of exactly 18 into a missing value even though the first label is
# "18-34"; include_lowest keeps 18 inside the first bin.
death_row.age.describe()
categories = ["18-34", "35-45", "45+"]
death_row["age"] = pd.cut(death_row["age"], [18, 34, 45, 99], labels = categories, include_lowest = True)
death_row.head()
death_row.dtypes

# Co-defendants: (-1, 0] -> no, (0, 99] -> yes.
death_row.co_defendants.describe()
categories = ["no", "yes"]
death_row["co_defendants"] = pd.cut(death_row["co_defendants"], [-1, 0, 99], labels = categories)
death_row.head()
death_row.dtypes
# The inmates' names are not used as features.
death_row.drop(["last_name", "first_name"], axis = 1, inplace = True)
death_row.head()
death_row.last_statement.head(25)
death_row.last_statement.tail(25)

# Different ways that "no statement" is represented in the data:
#   1. Spoken: No.
#   2. No statement given.
#   3. no statement
#   4. This offender declined to make a last statement.
# All of them are replaced with the single marker "none".
no_statement_markers = [
    "Spoken: No.",
    "No statement given.",
    "no statement",
    "This offender declined to make a last statement.",
]
for marker in no_statement_markers:
    # regex=False: the markers are literal text. Read as regular expressions
    # (the default on older pandas), the "." would match any character.
    death_row["last_statement"] = death_row["last_statement"].str.replace(marker, "none", regex=False)
death_row.last_statement.head(25)
death_row.last_statement.tail(25)
death_row.describe()
print(death_row.age_received.value_counts())

# Print the value counts of every column except the free-text statement.
column_names = list(death_row.columns)
column_names.remove("last_statement")
column_names
for column in column_names:
    print("-------------", column, "-------------", "\n", death_row[column].value_counts())

# Clean-up noted from the value counts above:
#   - type_of_crime: "shotting" needs to be changed to "shooting".
#   - To predict weapon or type_of_crime, a main crime needs to be decided.
#   - race_vic: remove the space after "white" and "black"; might also want
#     to change it to white / non-white.
#   - vic_police: remove the space after "yes" and "no"; change "unknown" to
#     "no", because our belief is that if the victim had been police the
#     record would have said yes.
#   - race: remove the space after "White" and "Hispanic"; it might also make
#     sense to change it to white / non-white.

# Fix the "shotting" typo in type_of_crime.
death_row["type_of_crime"] = death_row["type_of_crime"].str.replace("shotting", "shooting")
death_row.type_of_crime.value_counts()
# Because the majority of the crimes involve guns, it may be beneficial, in order to try to predict type_of_crime, to change it to gun or non-gun. Non-gun (labelled "other" below) encompasses any crime that did not use a gun: arson, stabbing, beating, strangling, drowning, car, neglect, and rape. Contract killings are included in gun.
# Rename "shooting" to "gun" and strip the trailing space some entries carry.
for old_text, new_text in [("shooting", "gun"), ("gun ", "gun")]:
    death_row["type_of_crime"] = death_row["type_of_crime"].str.replace(old_text, new_text)
death_row["type_of_crime"].value_counts()

death_row["type_of_crime"] = death_row["type_of_crime"].astype("object")

# Combinations that involve a gun (plus contract killings) collapse to "gun".
# These are substring replacements applied in this exact order, so an earlier
# entry can shorten a value before a later, longer entry is tried.
gun_combinations = [
    "gun, stabbing",
    "gun, strangling",
    "beating, gun, stabbing",
    "gun, stabbing, strangling",
    "beating, gun",
    "arson, gun",
    "beating, gun, strangling",
    "rape, gun",
    "drowning, gun, strangling",
    "arson, gun, stabbing",
    "arson, gun, strangling",
    "drowning, gun",
    "contract",
    "gun, stabbing",
]
for combination in gun_combinations:
    death_row["type_of_crime"] = death_row["type_of_crime"].str.replace(combination, "gun")
death_row["type_of_crime"].value_counts()
death_row["type_of_crime"].dtype

# Everything that is not exactly "gun" becomes "other".
is_not_gun = death_row["type_of_crime"] != "gun"
death_row.loc[is_not_gun, "type_of_crime"] = "other"
death_row["type_of_crime"].value_counts()
# Remaining clean-up:
#   - race_vic: remove the space after "white" and "black".
#   - vic_police: remove the space after "yes" and "no"; change "unknown" to
#     "no", because our belief is that if the victim had been police the
#     record would have said yes.
#   - race: remove the space after "White" and "Hispanic"; it might also make
#     sense to change it to white / non-white.
death_row.head()

# prior_record: one "no" carries a trailing space; remove it and check that
# only three categories remain.
death_row["prior_record"].value_counts()
death_row["prior_record"] = death_row["prior_record"].str.replace("no ", "no")
death_row["prior_record"].value_counts()

# race_vic: we have decided to break race into white, hispanic, black,
# other, and unknown.
death_row["race_vic"].value_counts()
race_vic_fixes = [
    ("white ", "white"),
    ("black ", "black"),
    ("unkown", "unknown"),
    ("asian", "other"),
    ("middle eastern", "other"),
    ("samoan", "other"),
]
for old_text, new_text in race_vic_fixes:
    death_row["race_vic"] = death_row["race_vic"].str.replace(old_text, new_text)
death_row["race_vic"].value_counts()

# race: same grouping; also lower-case the labels and strip trailing spaces.
death_row["race"].value_counts()
race_fixes = [
    ("White", "white"),
    ("white ", "white"),
    ("Black", "black"),
    ("Hispanic", "hispanic"),
    ("hispanic ", "hispanic"),
    ("Other", "other"),
]
for old_text, new_text in race_fixes:
    death_row["race"] = death_row["race"].str.replace(old_text, new_text)
death_row["race"].value_counts()

# vic_police: strip stray spaces and treat "unknown" as "no".
death_row["vic_police"].value_counts()
vic_police_fixes = [
    ("no ", "no"),
    ("yes ", "yes"),
    (" no", "no"),
    ("unknown", "no"),
]
for old_text, new_text in vic_police_fixes:
    death_row["vic_police"] = death_row["vic_police"].str.replace(old_text, new_text)
death_row["vic_police"].value_counts()
death_row.weapon.value_counts()

# Collapse weapon to gun, knife, strangulation_item, and other.
# If a gun is used it is the main weapon, then knife, then strangulation_item.
#
# The rules are substring replacements applied strictly in this order, and
# later rules rely on the output of earlier ones (for example "bat" -> "other"
# turns "bathtub" into "otherhtub", which a later rule cleans up). Do not
# reorder them.
#
# Two rules were corrected to follow the "gun is the main weapon" priority:
# "cord, fire, gun" used to map to strangulation_item and "club, gun" used to
# map to knife; both now map to gun.
# NOTE(review): the late ("other, knife" -> "other") rule looks inconsistent
# with knife outranking other -- confirm against the data before changing it.
weapon_rules = [
    ("hands, knife", "knife"),
    ("gun, knife", "gun"),
    ("cord, fireplace brush", "strangulation_item"),
    ("cord, fire, gun", "gun"),
    ("coat hangers", "strangulation_item"),
    ("ace bandage", "strangulation_item"),
    ("clothes", "strangulation_item"),
    ("concrete", "other"),
    ("blunt object", "other"),
    ("gun, water", "gun"),
    ("bag, gun", "gun"),
    ("knife, rope", "knife"),
    ("belt, club", "strangulation_item"),
    ("sword", "knife"),
    ("screwdriver", "knife"),
    ("rock", "other"),
    ("gun, lamp", "gun"),
    ("asphalt", "other"),
    ("board, strangulation_item", "other"),
    ("sissors", "knife"),
    ("club, gun", "gun"),
    ("strangulation_item, gun", "gun"),
    ("gun, pipe", "gun"),
    ("hammer", "other"),
    ("car", "other"),
    ("gun, hands", "gun"),
    ("cord", "strangulation_item"),
    ("tool", "other"),
    ("hammer, knife", "knife"),
    ("fire, gun", "gun"),
    ("bat", "other"),
    ("fire", "other"),
    ("other, hands", "strangulation_item"),
    ("unknown", "other"),
    ("gun, wire", "gun"),
    ("board", "other"),
    ("knife, other", "knife"),
    ("knife, pipe", "knife"),
    ("bar", "other"),
    ("bathtub", "other"),
    ("rope", "strangulation_item"),
    ("strangulation_item, hammer", "strangulation_item"),
    ("bar, knife", "knife"),
    ("belt, fire", "strangulation_item"),
    ("other, knife", "knife"),
    ("pillow", "strangulation_item"),
    ("strangulation_item, other", "strangulation_item"),
    ("belt, other", "strangulation_item"),
    ("axe", "other"),
    ("otherhtub", "other"),
    ("heroin", "other"),
    ("knife, water", "knife"),
    ("bag, other", "strangulation_item"),
    ("other, other, knife", "knife"),
    ("starvation", "other"),
    ("statuette", "other"),
    ("steel lock", "other"),
    ("pickax", "knife"),
    ("hands, strangulation_item", "strangulation_item"),
    ("strangulation_item, hands", "strangulation_item"),
    ("plastic tie wrap", "strangulation_item"),
    ("poison", "other"),
    ("cellophane, gun, sink", "gun"),
    ("beer bottle, hands", "hands"),
    ("hands, water", "hands"),
    ("strangulation_item, ice pick", "knife"),
    ("hands, sand", "hands"),
    ("club", "other"),
    ("knife, mug", "knife"),
    ("ice pick, knife", "knife"),
    ("pipe", "other"),
    ("hatchet", "knife"),
    ("chain", "other"),
    ("bumper jack", "other"),
    ("frying pan", "other"),
    ("river", "other"),
    ("other, gun, other", "gun"),
    ("other, other", "other"),
    ("other ", "other"),
    ("other, knife", "other"),
    ("other, hands", "other"),
]
for old_text, new_text in weapon_rules:
    death_row["weapon"] = death_row["weapon"].str.replace(old_text, new_text)
death_row.weapon.value_counts()

# Finally fold hands and strangulation_item into "other".
for old_text in ("hands", "strangulation_item"):
    death_row["weapon"] = death_row["weapon"].str.replace(old_text, "other")
death_row.weapon.value_counts()
# Collapse main_crime into murder, murder_rape, murder_robbery,
# murder_rape_robbery, and murder_other. These are substring replacements
# applied strictly in this order: longer combinations come before the shorter
# ones they contain, so the order must not be changed.
main_crime_rules = [
    ("murder-serial, robbery-serial, rape", "murder_rape_robbery"),
    ("murder-serial, robbery-serial", "murder_robbery"),
    ("murder-serial, rape-serial, kidnapping", "murder_rape"),
    ("murder-serial, rape, robbery", "murder_rape_robbery"),
    ("murder-serial, rape-serial", "murder_rape"),
    ("murder-serial, rape", "murder_rape"),
    ("murder-serial", "murder"),
    ("unknown", "murder"),
    ("murder-attempted, robbery", "murder_robbery"),
    ("murder-attempted, escape", "murder_other"),
    ("murder, car theft, eluding arrest, kidnapping, robbery", "murder_robbery"),
    ("murder, car theft, kidnapping, rape", "murder_rape_robbery"),
    ("murder, car theft, kidnappy, robbery", "murder_robbery"),
    ("murder, car theft, rape, robbery", "murder_rape_robbery"),
    ("murder, car theft, kidnapping", "murder_robbery"),
    ("murder, eluding arrest, kidnapping", "murder_other"),
    ("murder, eluding arrest, rape", "murder_rape"),
    ("murder, kidnapping, rape, robbery", "murder_rape_robbery"),
    ("murder, kidnapping, robbery", "murder_robbery"),
    ("murder, kidnapping, ransom", "murder_other"),
    ("murder, kidnapping, rape", "murder_rape"),
    ("murder, kidnapping", "murder_other"),
    ("murder, for hire, rape", "murder_rape"),
    ("murder, for hire", "murder"),
    ("murder, insurance scam, rape", "murder_rape"),
    ("murder, insurance scam", "murder_other"),
    ("murder, car theft, robbery", "murder_robbery"),
    ("murder, car theft, rape", "murder_rape_robbery"),
    ("murder, car theft", "murder_robbery"),
    ("murder, eluding arrest, robbery", "murder_robbery"),
    ("murder, eluding arrest", "murder_other"),
    ("murder, rape, robbery", "murder_rape_robbery"),
    ("murder, rape", "murder_rape"),
    ("murder, mutilation-sexual", "murder_rape"),
    ("murder, identity theft", "murder_other"),
    ("murder, escape", "murder_other"),
    ("murder, robbery", "murder_robbery"),
    ("murder_robbery ", "murder_robbery"),
]
for old_text, new_text in main_crime_rules:
    death_row["main_crime"] = death_row["main_crime"].str.replace(old_text, new_text)
death_row["main_crime"].value_counts()
death_row.describe()

# Occupation: keep "laborer" and label everything else "other".
death_row["occupation"] = death_row["occupation"].astype("object")
is_not_laborer = death_row["occupation"] != "laborer"
death_row.loc[is_not_laborer, "occupation"] = "other"
death_row.describe()

# Every column except the free-text statement.
column_names = list(death_row.columns)
column_names.remove("last_statement")
column_names
def get_value_counts(df, column):
    """Tabulate how often each value occurs in ``df[column]``.

    Args:
        df: DataFrame to read from.
        column: Name of the column to count.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and two columns: ``count``
        (number of occurrences) and ``category`` (the value counted), in the
        order produced by ``value_counts``.
    """
    counts = df[column].value_counts()
    summary = pd.DataFrame(counts)
    summary.columns = ["count"]
    # The counted values live in the index; copy them into a regular column
    # before discarding the index.
    summary["category"] = summary.index
    return summary.reset_index(drop=True)
# Count table (count / category) for the binned age_received column.
age_received_df = get_value_counts(death_row, "age_received")
age_received_df
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt 
from matplotlib import cm 
from colorspacious import cspace_converter
import seaborn as sns
#What the function does: creates a bar graph of the count of each category
#Input: the df (with "count" and "category" columns), the title of the graph,
        #and the rotation of the x-axis tick labels
#Output: the pyplot module holding the bar graph
def category_bar_plot(df, title, rotation):
    # The "talk" plotting context only applies while the chart is drawn.
    with sns.plotting_context("talk"):
        sns.barplot(data = df, x = "category", y = "count",
                    palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Category")
        plt.ylabel("Count")
        plt.xticks(rotation = rotation)
    return plt
# Bar chart for every categorical feature: build the count table with
# get_value_counts, show it, then plot it. The last argument is the rotation
# of the x-axis tick labels.
category_bar_plot(age_received_df, "Age Received Breakdown", 0)
education_level_df = get_value_counts(death_row, "education_level")
education_level_df
category_bar_plot(education_level_df, "Education Level Breakdown", 90)
age_crime_df = get_value_counts(death_row, "age_crime")
age_crime_df
category_bar_plot(age_crime_df, "Age at Time of Crime Breakdown", 0)
occupation_df = get_value_counts(death_row, "occupation")
occupation_df
category_bar_plot(occupation_df, "Occupation Breakdown", 0)
prior_record_df = get_value_counts(death_row, "prior_record")
prior_record_df
category_bar_plot(prior_record_df, "Prior Record Breakdown", 0)
num_of_vic_df = get_value_counts(death_row, "num_of_vic")
num_of_vic_df
# Title corrected: this chart shows the number of victims, not of crimes.
category_bar_plot(num_of_vic_df, "Number of Victims Breakdown", 0)
main_crime_df = get_value_counts(death_row, "main_crime")
main_crime_df
category_bar_plot(main_crime_df, "Main Crime Breakdown", 90)
type_of_crime_df = get_value_counts(death_row, "type_of_crime")
type_of_crime_df
category_bar_plot(type_of_crime_df, "Type of Crime Breakdown", 0)
weapon_df = get_value_counts(death_row, "weapon")
weapon_df
category_bar_plot(weapon_df, "Main Weapon Breakdown", 0)
co_defendants_df = get_value_counts(death_row, "co_defendants")
co_defendants_df
category_bar_plot(co_defendants_df, "Codefendant Breakdown", 0)
race_vic_df = get_value_counts(death_row, "race_vic")
race_vic_df
category_bar_plot(race_vic_df, "Race of Victim Breakdown", 90)
vic_kid_df = get_value_counts(death_row, "vic_kid")
vic_kid_df
category_bar_plot(vic_kid_df, "Victim a Child Breakdown", 90)
vic_male_df = get_value_counts(death_row, "vic_male")
vic_male_df
category_bar_plot(vic_male_df, "Victim a Male Breakdown", 0)
vic_female_df = get_value_counts(death_row, "vic_female")
vic_female_df
category_bar_plot(vic_female_df, "Victim a Female Breakdown", 0)
vic_police_df = get_value_counts(death_row, "vic_police")
vic_police_df
# Title corrected: "Office" -> "Officer".
category_bar_plot(vic_police_df, "Victim a Police Officer Breakdown", 0)
age_df = get_value_counts(death_row, "age")
age_df
category_bar_plot(age_df, "Age at Time of Execution Breakdown", 0)
race_df = get_value_counts(death_row, "race")
race_df
category_bar_plot(race_df, "Race of Prisoner Breakdown", 0)
# County is only tabulated (first ten rows shown); it is not plotted.
county_df = get_value_counts(death_row, "county")
county_df.head(10)
time_spent_df = get_value_counts(death_row, "time_spent")
time_spent_df
category_bar_plot(time_spent_df, "Time Spent on Death Row Breakdown", 0)
from collections import Counter 
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
#What the function does: joins every entry of a text column into one string,
                         #splits it on whitespace and counts each word
#Input: df and column (every entry of the column must be a string)
#Output: a Counter (dictionary) with each word and the count of the word
def creating_freq_list_from_df_to_dict(df, column):
    all_text = " ".join(df[column].tolist())
    return Counter(all_text.split())
#What the function does: creates a word cloud that is in the shape of the mask passed in
#Input: the location where the mask image is saved, the frequency word dictionary,
        #the max # of words to include, and the title of the plot
#Output: the pyplot module holding the word cloud
def create_word_cloud_with_mask(path_of_mask_image, dictionary,
                                max_num_words, title):
    # The mask image is loaded as an array; it is used both as the cloud's
    # mask and as the input of the color generator.
    mask_pixels = np.array(Image.open(path_of_mask_image))

    # Build the word cloud from the word frequencies.
    cloud = WordCloud(
        background_color = "white",
        max_words = max_num_words,
        mask = mask_pixels,
        max_font_size = 125,
    )
    cloud.generate_from_frequencies(dictionary)

    # Recolor the cloud with a color generator built from the mask.
    mask_colors = ImageColorGenerator(mask_pixels)
    plt.figure(figsize = [8,8])
    recolored_cloud = cloud.recolor(color_func = mask_colors)
    plt.imshow(recolored_cloud, interpolation = "bilinear")
    plt.title(title)
    sns.set_context("poster")
    plt.axis("off")
    return plt
#What the function does: creates a df with two columns (count and word) holding
                         #the most frequent words, highest count first
#Input: the word frequency dictionary and the number of words wanted
#Output: a df with the top x words (an empty df with the same two columns
         #when the dictionary is empty)
def word_freq_dict_to_df_top_words(dictionary, number_of_words_wanted):
    # Building the frame from explicit columns (instead of from_dict followed
    # by renaming) also works for an empty dictionary, which used to raise a
    # ValueError.
    df = pd.DataFrame({
        "count": list(dictionary.values()),
        "word": list(dictionary.keys()),
    })
    df = df.sort_values(by = ["count"], ascending = False)
    return df[:number_of_words_wanted]
#What the function does: creates a bar graph of the count of each word
#Input: the df (with "count" and "word" columns) and the title of the graph
#Output: the pyplot module holding the bar graph
def top_words_bar_plot(df, title):
    # The "talk" plotting context only applies while the chart is drawn.
    with sns.plotting_context("talk"):
        sns.barplot(data = df, x = "word", y = "count",
                    palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.xticks(rotation = 90)
    return plt
#What the function does: creates a df with two columns (count and word) holding
                         #every word, highest count first
#Input: the word frequency dictionary
#Output: a df (an empty df with the same two columns when the dictionary is empty)
def word_freq_dict_to_df_all_words(dictionary):
    # Building the frame from explicit columns (instead of from_dict followed
    # by renaming) also works for an empty dictionary, which used to raise a
    # ValueError.
    df = pd.DataFrame({
        "count": list(dictionary.values()),
        "word": list(dictionary.keys()),
    })
    return df.sort_values(by = ["count"], ascending = False)
#What the function does: prints 2 statements: one with the total number of words
                        #and the other with the number of unique words
#Input: the frequency count dictionary
#Output: nothing is returned; the 2 statements are printed
def total_words_unique_words(dictionary):
    # The total is the sum of all counts. Summing the dictionary values
    # directly avoids building and sorting a DataFrame just to add one column
    # up, and also works when the dictionary is empty.
    total_words = sum(dictionary.values())
    print("The total number of words is", total_words)
    print("The total number of unique words is", len(dictionary))
def creating_freq_list_from_df_to_dict_2(df, column):
    """Count the words of a text column, ignoring entries that are not strings.

    All string entries are joined, split on whitespace, and counted.
    Non-string entries (for example missing values) are skipped. They used to
    be replaced by the placeholder word 'number', which added one fake
    occurrence of "number" to the counts for every missing entry.

    Args:
        df: DataFrame to read from.
        column: Name of the text column.

    Returns:
        A Counter mapping each word to its number of occurrences.
    """
    reviews = [review for review in df[column].tolist() if isinstance(review, str)]
    return Counter(" ".join(reviews).split())
# Word frequencies of the last statements before any text cleaning.
last_statements_dic = creating_freq_list_from_df_to_dict_2(death_row, "last_statement")
# Source of the mask image for the (currently disabled) word cloud:
#http://www.transparentpng.com/details/scroll-transparent-image-_4493.html
# create_word_cloud_with_mask("scroll3.png", last_statements_dic, 750, "Word Cloud Prior to Cleaning")
# Table and bar chart of the 20 most frequent words.
top_words = word_freq_dict_to_df_top_words(last_statements_dic, 20)
top_words
top_words_bar_plot(top_words, "Top 20 Words \n Prior to Cleaning and Separating")
# Print the total and the unique word counts.
total_words_unique_words(last_statements_dic)
# It was decided to change all first-person pronouns to "first_person_pronoun" and all other pronouns to "pronoun". The belief is that different types of criminals might speak of themselves differently from other criminals. Punctuation is removed prior to any changes, and all words are converted to lowercase.
# Lower-case the statements, then strip punctuation and digits.
# regex=True is passed explicitly: newer pandas versions treat the pattern as
# literal text by default, which would silently turn these two steps into
# no-ops. The punctuation class is [^\w\s]; the former [^\w^\s] contained a
# stray "^" that kept caret characters in the text.
death_row["last_statement"] = death_row["last_statement"].str.lower()
death_row["last_statement"] = death_row["last_statement"].str.replace(r"[^\w\s]", "", regex=True)
death_row["last_statement"] = death_row["last_statement"].str.replace(r"[0-9]+", "", regex=True)

first_person_pronouns = [" i ", " me ", " mine ", " my ", " we ", " our ", " us ", " ours "]
pronouns = [" you ", " he ", " she ", " it ", " they ", " him ", " her ", " them ", " your ", " yours ", " his ", " hers ", " its "]

# Replace whole words only, using word boundaries. Matching on the
# surrounding spaces missed pronouns at the very start or end of a statement
# and every second pronoun of an adjacent pair (the shared space was consumed
# by the first match). First-person pronouns are handled first; their
# replacement contains no other pronoun as a whole word.
first_person_pattern = r"\b(?:" + "|".join(word.strip() for word in first_person_pronouns) + r")\b"
other_pronoun_pattern = r"\b(?:" + "|".join(word.strip() for word in pronouns) + r")\b"
death_row["last_statement"] = death_row["last_statement"].str.replace(first_person_pattern, "first_person_pronoun", regex=True)
death_row["last_statement"] = death_row["last_statement"].str.replace(other_pronoun_pattern, "pronoun", regex=True)

# Word frequencies after cleaning.
last_statements_dic = creating_freq_list_from_df_to_dict_2(death_row, "last_statement")
# Source of the mask image for the (currently disabled) word cloud:
#http://www.transparentpng.com/details/scroll-transparent-image-_4493.html
# create_word_cloud_with_mask("scroll3.png", last_statements_dic, 750, "Word Cloud Prior to Cleaning")
top_words = word_freq_dict_to_df_top_words(last_statements_dic, 20)
top_words
top_words_bar_plot(top_words, "Top 20 Words")
total_words_unique_words(last_statements_dic)
# The last statements will be tokenized, and any words shorter than 3 characters will be removed. The last statements will then be stemmed using the Snowball stemmer.
import nltk
from nltk.stem.snowball import SnowballStemmer
def tokenize_last_statement(statement):
    """Split one last statement into word tokens with nltk.

    Args:
        statement: The statement text. Anything that is not a string (for
            example a missing value) is not tokenized.

    Returns:
        The list of tokens produced by ``nltk.word_tokenize``, or the string
        'error' when ``statement`` is not a string.
    """
    # Only the expected failure (non-string input) is mapped to 'error'.
    # The former bare ``except`` also swallowed real problems such as missing
    # nltk tokenizer data, silently turning every row into 'error'.
    if not isinstance(statement, str):
        return 'error'
    return nltk.word_tokenize(statement)
# Tokenize every last statement (row by row) and preview the result.
death_row["last_statement"] = death_row.apply(lambda row: tokenize_last_statement(row["last_statement"]), axis = 1)
death_row.last_statement.head(10)
# A stray `df` expression was removed here: no module-level `df` is defined in
# this script, so it raised a NameError before the export could run.
# Save the discretized data set.
death_row.to_csv('death_row_discritized.csv', index=False)