# MAIN QUESTION: Does number of promotions sent impact "generosity"?
# To answer this question, we will answer four sub questions:
# QUESTION 1: Does number of promotions sent impact the total amount of donations?
# QUESTION 2: Does number of promotions sent impact the average amount of donations?
# QUESTION 3: Does number of promotions sent impact the time since the last donation?
# QUESTION 4: Does number of promotions sent impact the frequency of donations?
# --
# QUESTION 5: Does number of promotions sent impact the percent of donations?
# NOTE: "Headings" and analysis are preceded by a single octothorpe
## Code comments & processing steps are preceded by two octothorpes
## STEP 1: Import libraries
## We are using pandas for our data frames and matplotlib for our graphs.
## Gridspec is used to arrange our graphs
import pandas as pd
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
## Load the raw donor records from the CSV in the working directory
donors = pd.read_csv('donors_data.csv')
## STEP 2: Examine and clean data
## First rows and summary statistics of the raw data
## (bare expressions like these only show output in an interactive session)
donors.head()
donors.describe()
## Drop the columns we do not need by building a new data frame that holds
## only the ones we want, and replace the haphazard source names as we go.
## A '_d' suffix marks variables that arrived already discretized.
## Mapping is source column -> cleaned name; its order is the column order.
column_names = {
    'homeowner dummy': 'homeowner',
    'NUMCHLD': 'numchildren',
    'INCOME': 'income_d',
    'gender dummy': 'gender',
    'WEALTH': 'wealth_d',
    'HV': 'homevalue',
    'Icmed': 'income_med',
    'Icavg': 'income_avg',
    'IC15': 'lowincome_perc',
    'NUMPROM': 'numpromos',
    'RAMNTALL': 'donations_total',
    'MAXRAMNT': 'donations_max',
    'LASTGIFT': 'donations_last',
    'totalmonths': 'donations_months_since_last',
    'TIMELAG': 'donations_months_between_first_second',
    'AVGGIFT': 'donations_avg',
}
## Place the selected columns side by side, labelled with the cleaned names
donors_cleaned = pd.concat(
    [donors[source] for source in column_names],
    axis=1,
    keys=list(column_names.values()),
)
donors_cleaned.describe()
# QUESTION 1: Does number of promotions sent impact the total amount of donations?
# Inputs for this plot:
# 1 -- the total number of promotions (x axis)
# 2 -- the total amount of donations (y axis)
## 'o' draws one circle marker per row with no connecting line
plt.plot(donors_cleaned['numpromos'], donors_cleaned['donations_total'], 'o')
plt.xlabel('Promotions')
plt.ylabel('Total Donation Amount')
plt.title('Number of Promotions vs Amount of Donations')
plt.show()
# That looks like a pretty big outlier
# We should take note of this outlier and then remove it so it doesn't skew the rest of the analysis
plt.boxplot(donors_cleaned.donations_total)
plt.show()
## Trying the filter on a scratch data frame (`test`) before making a big change
## First pass: keep only the rows whose donations total is < 2500
test = donors_cleaned[donors_cleaned.donations_total < 2500]
# Re-examining our boxplot
plt.boxplot(test.donations_total)
plt.show()
# Still some big outliers
# Settled on 1000 as that is a nice clean "donation-esque" number
## Second pass: filter the full data again, this time keeping totals <= 1000
test = donors_cleaned[donors_cleaned.donations_total <= 1000]
plt.boxplot(test.donations_total)
plt.show()
## Confirming we still have a good dataset (non-null counts per column)
test.count()
## Reassigning our variable so every later step works on the filtered rows
donors_cleaned = test
## Verifying it worked
donors_cleaned.count()
# BACK TO OUR ORIGINALLY SCHEDULED PROGRAMMING!!
# QUESTION 1: Does number of promotions sent impact the total amount of donations?
# Same plot as before, now drawn from the outlier-filtered data:
# 1 -- the total number of promotions (x axis)
# 2 -- the total amount of donations (y axis)
plt.plot(donors_cleaned['numpromos'], donors_cleaned['donations_total'], 'o')
plt.xlabel('Promotions')
plt.ylabel('Total Donation Amount')
plt.title('Number of Promotions vs Amount of Donations')
plt.show()
# This is so much more useful!!
# Moving on
# I'm sure there is a better way to do this...
# SIDE QUEST QUESTION: Does the promotions vs total donations picture differ by homeownership?
## NOTE(review): the question text was left blank originally; the wording above
## is inferred from the plot below -- confirm it matches the intent
## Split the rows on the homeowner flag so each group is drawn as its own series
homeowner_yes = donors_cleaned[donors_cleaned.homeowner == 1]
homeowner_no = donors_cleaned[donors_cleaned.homeowner == 0]
plt.plot(homeowner_yes.numpromos, homeowner_yes.donations_total, 'o', label='Homeowner')
plt.plot(homeowner_no.numpromos, homeowner_no.donations_total, 'o', label='Non-homeowner')
plt.title('Number of Promotions vs Amount of Donations')
plt.xlabel('Promotions')
plt.ylabel('Total Donation Amount')
## Without a legend the two marker colours cannot be told apart
plt.legend()
plt.show()
# QUESTION 2: Does number of promotions sent impact the average amount of donations?
# Inputs for this plot:
# 1 -- the total number of promotions (x axis)
# 2 -- the average amount of donations (y axis)
plt.plot(donors_cleaned['numpromos'], donors_cleaned['donations_avg'], 'o')
plt.xlabel('Promotions')
plt.ylabel('Average Donation Amount')
plt.title('Number of Promotions vs Average Donation')
plt.show()
# QUESTION 3: Does number of promotions sent impact the time since the last donation?
# Inputs for this plot:
# 1 -- the total number of promotions (x axis)
# 2 -- the time since the last donation (y axis)
plt.plot(
    donors_cleaned['numpromos'],
    donors_cleaned['donations_months_since_last'],
    'o',
)
plt.xlabel('Promotions')
plt.ylabel('Months since last donation')
plt.title('Number of Promotions vs Time Since Donation')
plt.show()
# QUESTION 3.5: Does number of promotions sent impact the time between first and second donation?
# To answer this question, we need:
# 1 -- the total number of promotions
# 2 -- the time between the first and second donation
plt.plot(donors_cleaned.numpromos, donors_cleaned.donations_months_between_first_second, 'o')
## Title and y label describe the column actually plotted here; they previously
## repeated the Question 3 text ("time since last donation")
plt.title('Number of Promotions vs Time Between First and Second Donation')
plt.xlabel('Promotions')
plt.ylabel('Months between first and second donation')
plt.show()
# QUESTION 4: Does number of promotions sent impact the frequency of donations?
# Inputs for this plot:
# 1 -- the total number of promotions (x axis)
# 2 -- the frequency of donations (y axis), derived from:
# -- 2a. The total donations
# -- 2b. The average donations
## Row-wise total / average.
## NOTE(review): this presumably equals the number of donations made, assuming
## donations_avg is the total divided by the gift count -- confirm against the
## data dictionary. A donations_avg of 0 would give inf/NaN rather than an error.
total_donated = donors_cleaned['donations_total']
average_donation = donors_cleaned['donations_avg']
donation_frequency = total_donated.div(average_donation)
plt.plot(donors_cleaned['numpromos'], donation_frequency, 'o')
plt.xlabel('Promotions')
plt.ylabel('Donation Frequency')
plt.title('Number of Promotions vs Donation Frequency')
plt.show()