import os
def get_data_from_files(path):
    """Read every regular file that sits directly inside ``path``.

    Args:
        path: Directory to scan. A trailing path separator is optional.

    Returns:
        A ``(results, filenames)`` tuple of two parallel lists: the text of
        each file and the bare file name it was read from, in the order
        ``os.listdir`` reports them.
    """
    results = []
    filenames = []
    for name in os.listdir(path):
        # os.path.join works whether or not ``path`` ends with a separator.
        full_path = os.path.join(path, name)
        # Sub-directories cannot be opened as text files, so skip them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as handle:
            text = handle.read()
        filenames.append(name)
        results.append(text)
    return results, filenames
# Load the text of every file in the inmates folder; ``inmates`` holds the
# file contents and ``filenames`` the matching file names (parallel lists).
inmates, filenames = get_data_from_files('FinalProject/inmates/')
import pandas as pd
import numpy as np
# One row per file: the file name is the index, the raw text lands in column 0.
df = pd.DataFrame(inmates, index=filenames)
# Move the file names out of the index into a regular column named 'index'.
df.reset_index(inplace=True)
# File names look like img_text_<number>_<first>_<last>.txt, so splitting on
# '_' puts the number, first name and last name at positions 2, 3 and 4.
# The vectorised .str accessor replaces a Python-level apply() per row and
# yields NaN, rather than raising IndexError, for a name with too few parts.
name_parts = df['index'].str.split('_')
df['inmate_number'] = name_parts.str[2]
# Drop the file extension from the last-name part.
df['last_name'] = name_parts.str[4].str.split('.').str[0]
df['first_name'] = name_parts.str[3]
df
index | 0 | inmate_number | last_name | first_name | |
---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John |
... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver |
380 rows × 5 columns
import re
# Normalise the raw text in column 0: lowercase it, then collapse every run of
# non-word characters and underscores into a single space.
df['clean'] = df.apply(
    lambda row: re.sub(r'[\W_]+', ' ', row[0].lower()),
    axis=1,
)
df
index | 0 | inmate_number | last_name | first_name | clean | |
---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... |
... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... |
380 rows × 6 columns
def get_occupation(summary):
    """Extract the single word that follows "occupation" in a cleaned summary.

    Args:
        summary: Cleaned summary text. Anything that is not a ``str`` is
            treated as having no occupation.

    Returns:
        The word after "occupation", or ``'none_listed'`` when the label is
        not found, the word is shorter than three characters, or the word
        contains "prior".
    """
    if not isinstance(summary, str):
        return 'none_listed'
    # The lookahead requires whitespace after the captured word, so a word at
    # the very end of the text does not match.
    match = re.search(r'(?<=occupation)(\W.*?)(?=\s)', summary)
    if match is None:
        return 'none_listed'
    word = match.group().strip()
    # NOTE(review): 'prior' presumably means the occupation field was blank
    # and the next field label was captured instead -- confirm against data.
    if 'prior' in word or len(word) < 3:
        return 'none_listed'
    return word
# Run the occupation extractor over every cleaned summary, keeping row order.
occupations = list(map(get_occupation, df['clean'].values))
occupations
['none_listed', 'cashier', 'welder', 'mechanic', 'painter', 'laborer', 'education', 'laborer', 'borex', 'press', 'truck', 'mechanic', 'millwright', 'laborer', 'construction', 'education', 'laborer', 'none_listed', 'plumber', 'sculptor', 'iron', 'bull', 'laborer', 'none_listed', 'carpenter', 'brick', 'truck', 'auto', 'direct', 'restaurant', 'mechanic', 'fork', 'forklift', 'janitor', 'construction', 'machinist', 'salesman', 'construction', 'musician', 'musicia', 'plumber', 'construction', 'roofer', 'education', 'laborer', 'press', 'laborer', 'drywaller', 'none_listed', 'barber', 'construction', 'education', 'tankerman', 'bartender', 'roofer', 'welder', 'mechanic', 'computer', 'laborer', 'laborer', 'roofer', 'laborer', 'education', 'none_listed', 'laborer', 'produce', 'gardener', 'truck', 'landscaping', 'laborer', '2porer', 'analyst', 'antique', 'auto', 'welder', 'clectr', 'none_listed', 'clecitician', 'nurses', 'laborer', 'painter', 'laborer', 'oachi', 'electrician', 'carpenter', 'maintenance', 'truck', 'cabinet', 'security', 'laborer', 'waiter', 'construction', 'truck', 'laborer', 'electrician', 'laborer', 'laborer', 'correctional', 'construction', 'laborer', 'laborer', 'education', 'edckisyes', 'unemployed', 'landscaper', 'jaborer', 'laborer', 'none_listed', 'none_listed', 'paint', 'general', 'telemarketing', 'laborer', 'none_listed', 'construction', 'auto', 'laborer', 'informant', 'roofer', 'laborer', 'truck', 'telephone', 'unemployed', 'electricianfmarketing', 'student', 'none_listed', 'laborer', 'none_listed', 'laborer', 'laborer', 'salesman', 'welder', 'welder', 'clectrician', 'brick', 'food', 'food', 'laborer', 'metal', 'none_listed', 'deliveryman', 'laborer', 'construction', 'warehouse', 'jaborer', 'laborer', 'delivery', 'cement', 'construction', 'carpenter', 'food', 'paint', 'brick', 'machine', 'education', 'heavy', 'accounting', 'asst', 'laborer', 'mechanic', 'govt', 'sales', 'clerk', 'cook', 'construction', 'laborer', 'none_listed', 'mechanic', '1aborer', 
'meat', 'painter', 'none_listed', 'none_listed', 'retail', 'laborer', 'laborer', 'electrician', 'car', 'none_listed', 'none_listed', 'mechanic', 'farmer', 'machinist', 'baker', 'commercial', 'video', 'machinist', 'machinist', 'laborer', 'none_listed', 'education', 'janitor', 'electrician', 'cashier', 'small', 'cable', 'laborer', 'welder', 'laborer', 'mechanic', 'radiator', 'mechanic', 'shrimper', 'paint', 'none_listed', 'landscaping', 'none_listed', 'none_listed', 'cook', 'none_listed', 'feed', 'auto', 'sales', 'laborer', 'student', 'cement', 'none_listed', 'laborer', 'laborer', 'cook', 'none_listed', 'none_listed', 'auto', 'drywaller', 'none_listed', 'painter', 'construction', 'laborer', 'clectrician', 'laborer', 'pax', 'electrician', 'laborer', 'nursing', 'laborer', 'produce', 'jaborer', 'none_listed', 'sales', 'anto', 'custom', 'electrician', 'roofer', 'truck', 'none_listed', 'warehouseman', 'auto', 'laborer', 'welder', 'laborer', 'laborer', 'carpenter', 'laborer', 'stocker', 'roofer', 'cashier', 'laborer', 'labor', 'stocker', 'oilfield', 'none_listed', 'painter', 'clectrician', 'restaurant', 'laborer', 'computer', 'none_listed', 'electrician', 'sales', 'maintenance', 'welder', 'carpenter', 'physical', 'mechanic', 'gump', 'jaborer', 'echanic', 'none_listed', 'carpenter', 'laborer', 'apprentice', 'electrician', 'welder', 'sales', 'laborer', 'laborer', 'truck', 'carpenter', 'auto', 'welder', 'education', 'stockbroker', 'general', 'landscaping', 'construct', 'construction', 'none_listed', 'carpenter', 'laborer', 'oil', 'laborer', 'paint', 'mechanic', 'mover', 'student', 'electrician', 'fork', 'ast', 'janitorial', 'receiving', 'auto', 'chemical', 'welder', 'motorcycle', 'mechanic', 'laborer', 'cook', 'electrician', 'heavy', 'none_listed', 'mechanic', 'mechanic', 'iron', 'auto', 'roofer', 'laborer', 'farm', 'electrician', 'cook', 'laborer', 'none_listed', 'cook', 'manager', 'general', 'laborer', 'insurance', 'cook', 'none_listed', 'laborer', 'painter', 'mechanic', 
'education', 'office', 'ghneck', 'carpenter', 'delivery', 'security', 'cook', 'landscaping', 'diesel', 'laborer', 'laborer', 'construction', 'roughneck', 'construction', 'auto', 'education', 'construction', 'dishwasher', 'factory', 'laborer', 'carpenter', 'drywall', 'cement', 'none_listed', 'brickmason', 'laborer', 'roofer', 'construction', 'telemarketing', 'laborer', 'oilfield', 'clerical', 'laborer', 'dispatcher', 'carpenter', 'none_listed', 'electrician', 'laborer', 'laborer']
# Attach the extracted occupations as a new column (same row order as df).
df['occupation'] = occupations
# Bare expression: evaluates to the frame so it can be inspected.
df
index | 0 | inmate_number | last_name | first_name | clean | occupation | |
---|---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... | none_listed |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... | cashier |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... | welder |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... | mechanic |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... | painter |
... | ... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... | carpenter |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... | none_listed |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... | electrician |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... | laborer |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... | laborer |
380 rows × 7 columns
def get_priors(summary):
    """Classify whether a cleaned summary reports a prior record.

    Args:
        summary: Cleaned summary text. Anything that is not a ``str`` is
            treated as having no "record" field.

    Returns:
        ``'no'`` when the word right after "record" contains "none",
        ``'yes'`` for any other word, and ``'none_listed'`` when no word
        after "record" could be matched.
    """
    if not isinstance(summary, str):
        return 'none_listed'
    # The lookahead requires whitespace after the captured word, so a word at
    # the very end of the text does not match.
    match = re.search(r'(?<=record)(\W.*?)(?=\s)', summary)
    if match is None:
        return 'none_listed'
    word = match.group().strip()
    return 'no' if 'none' in word else 'yes'
# Classify every cleaned summary and store the labels next to the text.
priors = list(map(get_priors, df['clean'].values))
df['prior_record'] = priors
df
index | 0 | inmate_number | last_name | first_name | clean | occupation | prior_record | |
---|---|---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... | none_listed | no |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... | cashier | no |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... | welder | yes |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... | mechanic | yes |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... | painter | yes |
... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... | carpenter | yes |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... | none_listed | none_listed |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... | electrician | yes |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... | laborer | yes |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... | laborer | no |
380 rows × 8 columns
def get_edu(summary):
    """Extract the education level from a cleaned summary.

    Looks at the text between "education" and the first following "years",
    "yrs", "ged" or "prior", and takes the first run of digits in it.

    Args:
        summary: Cleaned summary text. Anything that is not a ``str`` is
            treated as having no education field.

    Returns:
        ``'<digits> years'``, or ``'none_listed'`` when the field or a number
        inside it cannot be found.
    """
    if not isinstance(summary, str):
        return 'none_listed'
    field = re.search(r'(?<=education)(.*?)(years|yrs|ged|prior)', summary)
    if field is None:
        return 'none_listed'
    digits = re.search(r'\d+', field.group().strip())
    if digits is None:
        # Field present but without a number (e.g. it ended at "ged").
        return 'none_listed'
    return digits.group() + " years"
# Extract the education level for every row, add it as a column, then write
# the whole frame out to CSV.
edu = list(map(get_edu, df['clean'].values))
df['education_level'] = edu
df.to_csv('V8_fromphotos.csv')
def get_vics(summary):
    """Print how often "male", "men" or "man" occurs in ``summary``.

    The match is substring-based, so longer words that contain one of these
    (for example "female") are counted as well.

    Args:
        summary: Cleaned summary text.

    Returns:
        None. The count is printed; ``'nope'`` is printed instead when
        ``summary`` is not text.
    """
    try:
        hits = re.findall(r'(male|men|man)', summary)
    except TypeError:
        # re raises TypeError for non-string input such as None.
        print('nope')
    else:
        print(len(hits))
# Run the counting helper on every cleaned summary. The counts are printed by
# get_vics itself; vic_f only collects whatever that function returns.
vic_f = [get_vics(summary) for summary in df['clean'].values]
1 1 1 3 1 0 3 0 3 3 1 2 4 4 1 2 2 2 1 2 0 1 0 2 1 1 0 2 2 2 1 3 0 1 3 0 2 0 3 3 2 1 2 2 1 0 0 1 6 2 1 2 8 3 0 0 3 1 4 1 3 2 3 6 4 0 0 1 0 0 3 1 0 1 7 3 0 0 1 3 0 2 3 0 1 3 0 1 1 3 4 8 1 0 4 4 0 0 2 2 0 0 1 0 5 1 2 1 2 2 0 2 1 1 0 2 0 1 1 0 8 5 2 2 2 3 4 1 6 1 1 1 0 2 0 4 3 1 2 0 3 0 1 5 1 1 2 2 4 0 1 2 1 1 2 3 0 3 1 0 0 2 1 1 1 0 1 1 4 1 5 1 7 6 2 1 1 1 4 0 1 1 2 0 0 3 0 5 1 2 0 2 2 0 2 3 0 2 5 1 1 1 4 0 1 2 1 1 1 0 6 5 0 1 2 1 0 1 1 1 1 1 3 0 1 2 3 0 2 2 3 0 2 5 2 2 0 2 1 1 1 2 2 3 2 3 1 3 1 2 1 1 2 7 4 0 5 1 1 2 0 1 0 10 3 2 1 2 0 1 0 1 3 0 3 3 2 1 2 3 1 0 2 3 2 1 3 3 0 2 0 2 3 0 0 0 0 1 1 0 1 2 1 1 2 3 3 0 6 2 4 2 6 2 0 1 2 2 3 3 0 3 1 1 2 1 3 4 2 1 1 3 9 3 1 1 0 1 0 0 1 2 2 0 2 2 0 2 0 1 0 1 2 1 0 1 0 8 3 1 6 2 4 2 0 2 1 2 0 0 3 2 0 0 2 0 0 1 1 2
def get_vics(summary):
    """Pull the victim tokens that follow "race of victim s" in a summary.

    This definition replaces the earlier ``get_vics`` of the same name.

    Args:
        summary: Cleaned summary text. Anything that is not a ``str`` is
            treated as having no victim field.

    Returns:
        Always a list:
        * the space-separated tokens from the label up to and including the
          first "male"/"female", when there are at most three of them
          (e.g. ``['white', 'male']``);
        * ``['error']`` when there are more than three tokens;
        * ``['none_listed']`` when the field cannot be found.
    """
    if not isinstance(summary, str):
        return ['none_listed']
    match = re.search(r'(?<=race of victim s)(.*?)(male|female)', summary)
    if match is None:
        return ['none_listed']
    tokens = match.group().strip().split(' ')
    if len(tokens) > 3:
        # Wrapped in a list like every other return value. The bare string
        # 'error' was indexed character-wise by code that takes element 0,
        # which produced 'e' as a victim race.
        return ['error']
    return tokens
# Extract the victim tokens for every row and keep them as a column.
vic_deets = list(map(get_vics, df['clean'].values))
df['vic_deets'] = vic_deets

# A three-item entry contributes its first item to multiple_vics; every other
# entry is marked 'no'.
multiple_vics = [
    deets[0] if len(deets) == 3 else 'no'
    for deets in df['vic_deets'].values
]
# 'yes'/'no' flags for whether 'female' / 'male' appears in the entry.
female_vics = [
    'yes' if 'female' in deets else 'no'
    for deets in df['vic_deets'].values
]
male_vics = [
    'yes' if 'male' in deets else 'no'
    for deets in df['vic_deets'].values
]

df['multiple_vics'] = multiple_vics
df['vic_female'] = female_vics
df['vic_male'] = male_vics
df
index | 0 | inmate_number | last_name | first_name | clean | occupation | prior_record | education_level | vic_deets | multiple_vics | vic_female | vic_male | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... | none_listed | no | 6 years | [white, male] | no | no | yes |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... | cashier | no | 10 years | [white, male] | no | no | yes |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... | welder | yes | 14 years | [white, male] | no | no | yes |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... | mechanic | yes | none_listed | [none_listed] | no | no | no |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... | painter | yes | 8 years | [hispanic, male] | no | no | yes |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... | carpenter | yes | 10 years | [none_listed] | no | no | no |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... | none_listed | none_listed | none_listed | [none_listed] | no | no | no |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... | electrician | yes | 9 years | [white, male] | no | no | yes |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... | laborer | yes | 10 years | [hispanic, female] | no | yes | no |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... | laborer | no | 7 years | [white, female] | no | yes | no |
380 rows × 13 columns
# For a three-item entry take the middle item, otherwise the first one.
# NOTE(review): presumably three items are (count, race, sex) -- confirm.
race_vics = [
    deets[1] if len(deets) == 3 else deets[0]
    for deets in df['vic_deets'].values
]
race_vics
['white', 'white', 'white', 'none_listed', 'hispanic', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'none_listed', 'white', 'white', 'white', 'none_listed', 'none_listed', 'white', 'hispanic', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'hispanic', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'hispanic', 'male', 'black', 'none_listed', 'white', 'white', 'none_listed', 'white', 'none_listed', 'hispanic', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'hispanic', 'black', 'white', 'white', 'white', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'none_listed', 'none_listed', 'white', 'white', 'none_listed', 'hispanic', 'white', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'white', 'none_listed', 'hite', 'none_listed', 'none_listed', 'none_listed', 'hispanic', 'hispanic', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'white', 'white', 'none_listed', 'white', 'white', 'none_listed', 'black', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'white', 'white', 'white', 'white', 'black', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'plack', 'waite', 'none_listed', 'white', 'none_listed', 'black', 'none_listed', 'hispanic', 'none_listed', 'white', 'white', 'white', 'white', 'white', 'none_listed', 'hispanic', 'white', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'white', 'hispanic', 
'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'e', 'e', 'none_listed', 'white', 'white', 'white', 'white', 'white', 'none_listed', 'none_listed', 'black', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'white', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'white', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'hispanic', 'white', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'black', 'white', 'white', 'none_listed', 'hispanic', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'white', 'white', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'hispanic', 'white', 'none_listed', 'white', 'white', 'white', 'white', 'none_listed', 'white', 'black', 'none_listed', 'none_listed', 'none_listed', 'e', 'none_listed', 'white', 'none_listed', 'white', 'white', 'white', 'none_listed', 'white', 'none_listed', 'white', 'hispanic', 'none_listed', 'white', 'white', 'none_listed', 'none_listed', 'e', 'none_listed', 'white', 'none_listed', 'black', 'black', 'white', 'asian', 'white', 'hispanic', 'none_listed', 'white', 'none_listed', 'white', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'black', 'black', 'none_listed', 'black', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'hispanic', 'none_listed', 'biack', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'white', 'white', 'white', 'none_listed', 'white', 'none_listed', 'white', 'white', 'white', '1', 'white', 'white', 'none_listed', 'black', 'white', 'none_listed', 
'none_listed', 'e', 'hite', 'e', 'white', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'none_listed', 'white', 'white', 'none_listed', 'white', 'black', 'none_listed', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'white', 'none_listed', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'white', 'none_listed', 'white', 'hispanic', 'none_listed', 'none_listed', 'none_listed', 'asian', 'white', 'none_listed', 'none_listed', 'none_listed', 'hite', 'none_listed', 'none_listed', 'white', 'none_listed', 'none_listed', 'white', 'hispanic', 'white']
# Attach the per-row victim race values as a new column.
df['race_vic'] = race_vics
# Bare expression: evaluates to the frame so it can be inspected.
df
index | 0 | inmate_number | last_name | first_name | clean | occupation | prior_record | education_level | vic_deets | multiple_vics | vic_female | vic_male | race_vic | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... | none_listed | no | 6 years | [white, male] | no | no | yes | white |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... | cashier | no | 10 years | [white, male] | no | no | yes | white |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... | welder | yes | 14 years | [white, male] | no | no | yes | white |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... | mechanic | yes | none_listed | [none_listed] | no | no | no | none_listed |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... | painter | yes | 8 years | [hispanic, male] | no | no | yes | hispanic |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... | carpenter | yes | 10 years | [none_listed] | no | no | no | none_listed |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... | none_listed | none_listed | none_listed | [none_listed] | no | no | no | none_listed |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... | electrician | yes | 9 years | [white, male] | no | no | yes | white |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... | laborer | yes | 10 years | [hispanic, female] | no | yes | no | hispanic |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... | laborer | no | 7 years | [white, female] | no | yes | no | white |
380 rows × 14 columns
def get_age_crime(summary):
    """Extract the value that follows "age at time of offense ".

    Args:
        summary: Cleaned summary text. Anything that is not a ``str`` is
            treated as having no age field.

    Returns:
        The matched text (starting with a digit) as a string, or
        ``'none_listed'`` when the field is not found or the matched value
        is shorter than two characters.
    """
    if not isinstance(summary, str):
        return 'none_listed'
    # The trailing \W means the value must be followed by a non-word
    # character; a value at the very end of the text does not match.
    match = re.search(r'(?<=age at time of offense )(\d.*?)\W', summary)
    if match is None:
        return 'none_listed'
    age = match.group().strip()
    if len(age) < 2:
        return 'none_listed'
    return age
# Run the age extractor over every cleaned summary, keeping row order.
age_crime = list(map(get_age_crime, df['clean'].values))
age_crime
['21', '46', '39', '32', '27', '30', '18', '19', '23', '22', '23', '27', '27', '22', '22', '43', '18', '19', '20', '21', '23', '26', '17', '34', '28', '25', 'none_listed', '18', '20', 'none_listed', '22', '30', '37', '27', '29', '33', '31', 'none_listed', '35', '28', 'none_listed', '24', '22', 'none_listed', '17', '26', '39', '39', 'none_listed', '38', '34', 'none_listed', '26', '21', '20', '51', '43', '27', '25', '18', '24', 'none_listed', '24', 'none_listed', '30', '33', '53', '42', '35', '27', 'none_listed', 'none_listed', '39', 'none_listed', '29', '32', '56', 'none_listed', '20', 'none_listed', '32', '22', '24', '33', '23', '26', '32', '29', '26', '35', '21', '29', '28', 'none_listed', '24', 'none_listed', '24', '26', 'none_listed', '20', '17', '20', '27', '19', '27', '29', '19', 'none_listed', '26', '33', '31', '21', '21', 'none_listed', '28', '28', '38', '34', '23', '20', 'none_listed', '38', '24', '25', '18', '30', '26', 'none_listed', '19', '18', '33', '39', '30', '23', 'none_listed', '31', '18', '21', '27', '36', '23', '18', '25', '22', '23', 'none_listed', 'none_listed', '39', '29', '44', '19', '20', '21', '45', '29', '27', 'none_listed', '19', '29', '21', '45', '24', '25', '19', '30', '17', '44', '18', '18', '33', '21', '32', '19', '30', '22', 'none_listed', '44', 'none_listed', 'none_listed', '27', '34', '32', '24', '29', '25', '19', '22', '27', '24', 'none_listed', '31', '19', '40', '22', 'none_listed', '33', '18', '32', '19', '24', '26', '22', '31', 'none_listed', 'none_listed', '31', 'none_listed', 'none_listed', '22', 'none_listed', '33', '24', '44', '17', 'none_listed', '20', 'none_listed', '19', '19', '29', 'none_listed', '44', '37', '26', '19', '23', '19', '19', '28', '20', '23', '19', '38', '24', '20', '26', '18', '23', '19', '37', '22', 'none_listed', '31', '26', 'none_listed', '21', '23', 'none_listed', 'none_listed', 'none_listed', '28', '24', '18', '20', '20', '19', '25', '22', '20', '34', 'none_listed', '19', '21', '22', '37', '45', '28', 
'37', '30', '24', '22', '22', '37', '26', '18', '17', '26', '40', '37', '22', '24', '36', '36', '23', 'none_listed', 'none_listed', '31', '26', '31', '24', '20', '32', '32', 'none_listed', '28', '22', 'none_listed', '30', '29', '28', '31', 'none_listed', '23', '20', 'none_listed', '29', '19', '17', '52', '27', '927', '35', '35', '27', 'none_listed', '20', '25', '22', '33', '30', '22', '24', '34', '23', '21', '26', 'none_listed', '19', '34', '22', 'none_listed', '23', '35', '47', '19', '28', '19', 'none_listed', '19', '30', '30', 'none_listed', '23', '26', '48', '50', '18', '28', '31', 'none_listed', 'none_listed', '20', '25', 'none_listed', '44', '32', '35', '54', '23', '19', '19', '24', '31', '33', 'none_listed', '34', '19', '25', '36', '20', 'none_listed', '28', '20', '24', '50', 'none_listed', 'none_listed', '49', '20', '21']
# Attach the extracted ages as a new column (string values, same row order).
df['age_crime'] = age_crime
# Bare expression: evaluates to the frame so it can be inspected.
df
index | 0 | inmate_number | last_name | first_name | clean | occupation | prior_record | education_level | vic_deets | multiple_vics | vic_female | vic_male | race_vic | age_crime | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | img_text_732_Johnny_Anderson.txt | Name: dohnny Anderson\n\npop: 12/_ 28 /_59 Rec... | 732 | Anderson | Johnny | name dohnny anderson pop 12 28 59 received cou... | none_listed | no | 6 years | [white, male] | no | no | yes | white | 21 |
1 | img_text_810_Betty_Beets.txt | Name: BettyLouBeets RIO\nDOB:03/12/37_ Receive... | 810 | Beets | Betty | name bettyloubeets rio dob 03 12 37 received 1... | cashier | no | 10 years | [white, male] | no | no | yes | white | 46 |
2 | img_text_981_Daniel_Hittle.txt | Name: Daniel Joe Hittle D.R. # 981\n\n \n\n \n... | 981 | Hittle | Daniel | name daniel joe hittle d r 981 v pop 3 1 50 re... | welder | yes | 14 years | [white, male] | no | no | yes | white | 39 |
3 | img_text_651_John_Satterwhite.txt | Name: John Thomas Satterwhite D.R.# 651\n\nDOB... | 651 | Satterwhite | John | name john thomas satterwhite d r 651 dob 12 29... | mechanic | yes | none_listed | [none_listed] | no | no | no | none_listed | 32 |
4 | img_text_999186_John_Chavez.txt | Name: John Chavez D.R.# 999186\nDOB: _ 04/27/6... | 999186 | Chavez | John | name john chavez d r 999186 dob 04 27 68 recei... | painter | yes | 8 years | [hispanic, male] | no | no | yes | hispanic | 27 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | img_text_838_Gerald_Mitchell.txt | Name: Gerald Lee Mitchell ____._.___ D.R.#838_... | 838 | Mitchell | Gerald | name gerald lee mitchell d r 838 dob 12 27 67 ... | carpenter | yes | 10 years | [none_listed] | no | no | no | none_listed | none_listed |
376 | img_text_555_Charles_Rumbaugh.txt | CHARLES FRANCIS RUMBAUGH\n\nEXECUTION #555 Dat... | 555 | Rumbaugh | Charles | charles francis rumbaugh execution 555 date se... | none_listed | none_listed | none_listed | [none_listed] | no | no | no | none_listed | none_listed |
377 | img_text_980_Claude_Jones.txt | Claude Howard Jones 980\n\n \n\n \n\n \n\nName... | 980 | Jones | Claude | claude howard jones 980 name d r pop 9 24 40 r... | electrician | yes | 9 years | [white, male] | no | no | yes | white | 49 |
378 | img_text_999145_Rolando_Ruiz.txt | Name: Roland Ruiz, Jr. D.R. #999145\n\n@ DOB: ... | 999145 | Ruiz | Rolando | name roland ruiz jr d r 999145 dob 07 04 72 re... | laborer | yes | 10 years | [hispanic, female] | no | yes | no | hispanic | 20 |
379 | img_text_954_Oliver_Cruz.txt | David Oliver Cruz\nD.R.# ao\n\n \n\n \n\n \n\n... | 954 | Cruz | Oliver | david oliver cruz d r ao name dob 5 18 67 rece... | laborer | no | 7 years | [white, female] | no | yes | no | white | 21 |
380 rows × 15 columns
def get_weapon(summary):
    """Classify the weapon mentioned in a crime-summary string.

    Keywords are tested as plain substrings in a fixed priority order
    (knife, gun, cord, blunt object); the first one found wins.

    Args:
        summary: Text to search. Matching is case-sensitive, so callers
            should pass lower-cased text.

    Returns:
        'knife', 'gun', 'cord' or 'blunt object' when the corresponding
        keyword occurs in ``summary``; 'other' when none of them does;
        'none_listed' when ``summary`` does not support substring tests
        (for example ``None`` or a number).
    """
    try:
        if 'knife' in summary:
            return 'knife'
        if 'gun' in summary:
            return 'gun'
        # The trailing space is part of the needle.
        # NOTE(review): 'cord ' is also a substring of 'record ' -- confirm
        # whether that match is wanted.
        if 'cord ' in summary:
            return 'cord'
        # The original tested the bare literal 'blunt object', which is always
        # truthy, so this branch fired for every remaining summary.
        if 'blunt object' in summary:
            return 'blunt object'
        return 'other'
    except TypeError:
        # Raised by ``in`` when summary is not a string/container.
        return 'none_listed'
# For each cleaned record keep the piece that follows the first 'summary'
# marker (up to the next marker, if any); records without the marker get the
# placeholder 'nope'.
df['clean_summary'] = [
    text.split('summary')[1] if 'summary' in text else 'nope'
    for text in df['clean'].values
]
# One weapon label per record, in row order.
weapon = list(map(get_weapon, df['clean_summary'].values))
# Number of records for which no 'summary' marker was found.
len(df.loc[df['clean_summary'] == 'nope'])
97
# Display the list of per-record weapon labels.
weapon
['blunt object', 'blunt object', 'gun', 'gun', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'gun', 'blunt object', 'knife', 'blunt object', 'blunt object', 'knife', 'gun', 'blunt object', 'gun', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'gun', 'blunt object', 'blunt object', 'gun', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'cord', 'gun', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'cord', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'knife', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt 
object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'knife', 'blunt object', 'gun', 'knife', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'gun', 'knife', 'gun', 'blunt object', 'gun', 'gun', 'cord', 'blunt object', 'blunt object', 'blunt object', 'cord', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'knife', 'blunt object', 'gun', 'gun', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'cord', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt 
object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'knife', 'knife', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'knife', 'blunt object', 'blunt object', 'gun', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'gun', 'blunt object', 'blunt object', 'knife', 'blunt object', 'cord', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'knife', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'gun', 'gun', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'blunt object', 'gun', 'blunt object', 'blunt object', 'blunt object']
# Store the weapon labels as a new 'weapon' column, then list all columns.
df['weapon'] = weapon
df.columns
Index([ 'index', 0, 'inmate_number', 'last_name', 'first_name', 'clean', 'occupation', 'prior_record', 'education_level', 'vic_deets', 'multiple_vics', 'vic_female', 'vic_male', 'race_vic', 'age_crime', 'clean_summary', 'weapon'], dtype='object')
# Columns kept for the exported table, in output order.
columns = [
    'inmate_number',
    'last_name',
    'first_name',
    'education_level',
    'age_crime',
    'occupation',
    'prior_record',
    'multiple_vics',
    'weapon',
    'race_vic',
    'vic_male',
    'vic_female',
]
# Build a frame restricted to those columns and display it.
df1 = pd.DataFrame(data=df, columns=columns)
df1
inmate_number | last_name | first_name | education_level | age_crime | occupation | prior_record | multiple_vics | weapon | race_vic | vic_male | vic_female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 732 | Anderson | Johnny | 6 years | 21 | none_listed | no | no | blunt object | white | yes | no |
1 | 810 | Beets | Betty | 10 years | 46 | cashier | no | no | blunt object | white | yes | no |
2 | 981 | Hittle | Daniel | 14 years | 39 | welder | yes | no | gun | white | yes | no |
3 | 651 | Satterwhite | John | none_listed | 32 | mechanic | yes | no | gun | none_listed | no | no |
4 | 999186 | Chavez | John | 8 years | 27 | painter | yes | no | blunt object | hispanic | yes | no |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
375 | 838 | Mitchell | Gerald | 10 years | none_listed | carpenter | yes | no | blunt object | none_listed | no | no |
376 | 555 | Rumbaugh | Charles | none_listed | none_listed | none_listed | none_listed | no | gun | none_listed | no | no |
377 | 980 | Jones | Claude | 9 years | 49 | electrician | yes | no | blunt object | white | yes | no |
378 | 999145 | Ruiz | Rolando | 10 years | 20 | laborer | yes | no | blunt object | hispanic | no | yes |
379 | 954 | Cruz | Oliver | 7 years | 21 | laborer | no | no | blunt object | white | no | yes |
380 rows × 12 columns
# Write the selected columns to a CSV file in the working directory
# (default to_csv options), then show the number of rows.
df1.to_csv('V9_photo_inmates.csv')
len(df1)
380