# Load the state crime-rate table; read.csv() already assumes a header row
# and comma-separated fields.
crime <- read.csv("http://datasets.flowingdata.com/crimeRatesByState2005.csv")

# First look: the murder column against the burglary column, every row included.
plot(crime$murder, crime$burglary)
PROBLEM: The outlier on the far right is skewing the graph SOLUTION: Remove that outlier to get a better understanding of the data as a whole NOTE: This does not mean we are ignoring Washington, DC. This means we will return to it later
# Build one row mask that excludes both the "District of Columbia" row and the
# "United States" row (presumably the national aggregate), then redraw.
is_plotted_state <- crime$state != "District of Columbia" &
  crime$state != "United States"
crime2 <- crime[is_plotted_state, ]

plot(crime2$murder, crime2$burglary)
PROBLEM: Axes should start at zero SOLUTION: Set x-axis (0-10) and y-axis (0-1200)
# Same scatterplot, with both axes anchored at zero.
plot(
  crime2$murder, crime2$burglary,
  xlim = c(0, 10),
  ylim = c(0, 1200)
)
PROBLEM: We could use more clarity SOLUTION: Add a LOESS curve with scatter.smooth()
# Scatterplot with a fitted LOESS curve drawn through the points,
# on the same zero-based axis limits.
scatter.smooth(
  crime2$murder, crime2$burglary,
  xlim = c(0, 10),
  ylim = c(0, 1200)
)
PROBLEM: This is still ugly SOLUTION: Fix it in Illustrator (if time permits)
# Redraw the murder/burglary LOESS scatterplot.
scatter.smooth(
  crime2$murder, crime2$burglary,
  xlim = c(0, 10),
  ylim = c(0, 1200)
)

# Scatterplot matrix: every pairing of columns 2 through 9 of crime2.
plot(crime2[, 2:9])
PROBLEM: We want curves! SOLUTION: add panel.smooth
# Scatterplot matrix again, with panel.smooth drawing a smoothed curve in
# each panel.
plot(
  crime2[, 2:9],
  panel = panel.smooth
)
PROBLEM: It’s ugly! SOLUTION: Fix it in Illustrator (if time permits)
Tip from Nathan: 1. Decide what part of the story you want to tell 2. Design graphic to highlight this area 3. Do not obscure facts
# Reload the crime table from the tab-separated file. read.delim() is
# read.csv() with a tab as the field separator.
crime <- read.delim("http://datasets.flowingdata.com/crimeRatesByState2005.tsv")

# Naive bubble chart: the raw population values are handed to symbols() as
# the circle radii.
symbols(crime$murder, crime$burglary, circles = crime$population)
PROBLEM: The circles are sized by radius, not by area, so differences in population are exaggerated SOLUTION: use sqrt(crime$population / pi) as the radius so that circle area is proportional to population
# Area of a circle is pi * r^2, so r = sqrt(population / pi) makes each
# circle's area (rather than its radius) proportional to population.
radius <- sqrt(crime$population / pi)

symbols(
  crime$murder, crime$burglary,
  circles = radius
)
PROBLEM: The circles are huge! SOLUTION: Scale the circles down with the inches argument
# Bubble chart with the circles scaled down through `inches`, red fill,
# white outlines and readable axis labels.
symbols(
  crime$murder, crime$burglary,
  circles = radius,
  inches = 0.35,
  fg = "white",
  bg = "red",
  xlab = "Murder Rate",
  ylab = "Burglary Rate"
)
NOW WITH SQUARES!
# Same idea with squares: side length sqrt(population) gives each square an
# area proportional to population.
symbols(
  crime$murder, crime$burglary,
  squares = sqrt(crime$population),
  inches = 0.5,
  xlab = "Murder Rate",
  ylab = "Burglary Rate"
)
PROBLEM: We can see a relationship, but we have no idea which bubble is which state SOLUTION: add labels with text()
# Redraw the scaled bubble chart ...
symbols(
  crime$murder, crime$burglary,
  circles = radius,
  inches = 0.35,
  fg = "white",
  bg = "red",
  xlab = "Murder Rate",
  ylab = "Burglary Rate"
)

# ... and write each row's state name, at half size, on the centre of its
# bubble.
text(
  crime$murder, crime$burglary,
  labels = crime$state,
  cex = 0.5
)
PROBLEM: It’s ugly and Georgia is behind Texas! SOLUTION: Fix it in Illustrator (if time permits)
# Load the birth-rate table (comma-separated, with a header row).
birth <- read.csv("http://datasets.flowingdata.com/birth-rate.csv")

# Text-mode stem-and-leaf display of the X2008 column.
stem(birth[["X2008"]])
##
## The decimal point is at the |
##
## 8 | 2371334468999
## 10 | 01223455566999001222334555777889
## 12 | 00011111356789993789
## 14 | 0034566788991237
## 16 | 227779123677889
## 18 | 00233677888900448
## 20 | 0024445688912455679
## 22 | 0057834579
## 24 | 11456677771347
## 26 | 31335667
## 28 | 014999
## 30 | 124234
## 32 | 1449069
## 34 | 556049
## 36 | 8890
## 38 | 023455823468
## 40 | 23125
## 42 | 699
## 44 | 17
## 46 | 252
## 48 |
## 50 |
## 52 | 5
# Default histogram of the X2008 column.
hist(birth$X2008)

# The same histogram with an explicit title, axis labels and colours.
hist(
  birth$X2008,
  main = "Global Distribution of Birth Rates",
  xlab = "Live births per 1,000 population",
  ylab = "Country Count",
  col = "purple",
  border = "white"
)
PROBLEM: Even with that clean up, it’s still unattractive. SOLUTION: Fix it in Illustrator (if time permits)
NOTE: Density plots cannot have missing values. The first step is always to remove the missing values.
# lattice supplies histogram(), which the multi-panel plots further down use.
library(lattice)

# Keep only the non-missing 2008 values and estimate their density.
has_rate_2008 <- !is.na(birth$X2008)
birth2008 <- birth$X2008[has_rate_2008]
d2008 <- density(birth2008)

# Set up the axes only (type = "n" draws no points or lines), then paint
# the density as a filled shape.
plot(d2008, type = "n")
polygon(d2008, col = "#821122", border = "#cccccc")

# Disabled experiment: histogram with the density curve laid over it.
# plot(d2008, type="n")
# histogram(birth$X2008,breaks=10)
# lines(d2008)
NOTE: Nathan suggests using python (which is awesome) but since we can do this easily right here in R, I’m doing it right here in R
# reshape supplies melt(), which stacks the table into long form.
library(reshape)
mdata <- melt(birth)

# Keep columns 2 and 3 of the melted table (presumably the source column
# name and its value) and give them meaningful names.
melted <- setNames(mdata[, 2:3], c("year", "rate"))

# One small histogram of rate per year, laid out 10 columns by 5 rows.
histogram(
  ~ rate | year,
  data = melted,
  layout = c(10, 5)
)
PROBLEM: Messy! Reads the wrong direction! An outlier is skewing the data! SOLUTION: First, remove outlier
# Drop the outlier by keeping only rates below 132. which() discards rows
# whose rate is NA; plain logical indexing would instead turn each of them
# into a row made entirely of NA.
melted <- melted[which(melted$rate < 132), ]
histogram(~ rate | year, data = melted, layout = c(10, 5))

# Year labels carry a leading "X" (compare birth$X2008). Strip it from every
# value rather than deciding from the first element only, so that an NA or
# an already-clean first label cannot break or skip the cleanup.
melted$year <- sub("^X", "", as.character(melted$year))

# Rebuild the lattice plot, then reorder the panels ten at a time so that
# the rows of the grid read in the intended direction.
h <- histogram(~ rate | year, data = melted, layout = c(10, 5))
update(h, index.cond = list(c(41:50, 31:40, 21:30, 11:20, 1:10)))
PROBLEM: Much better! But still, busy and needs some help SOLUTION: Illustrator if time permits!
# Load the TV size table (tab-separated, with a header row).
tvs <- read.table(
  "http://datasets.flowingdata.com/tv_sizes.txt",
  header = TRUE,
  sep = "\t"
)

# Filter outliers: keep sizes strictly between 10 and 80.
is_plausible_size <- tvs$size < 80 & tvs$size > 10
tvs <- tvs[is_plausible_size, ]

# Set breaks for histograms: shared bin edges every 5 units from 10 to 80.
breaks <- seq(10, 80, by = 5)

# Set layout: a grid of 4 rows by 2 columns.
par(mfrow = c(4, 2))

# Draw histograms one by one, newest year first.
hist(tvs[tvs$year == 2009, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2008, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2007, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2006, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2005, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2004, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2003, ]$size, breaks = breaks)
hist(tvs[tvs$year == 2002, ]$size, breaks = breaks)

# And now, as a loop: same histograms, from 2002 up to the latest year
# present in the data.
par(mfrow = c(4, 2))
for (year in 2002:max(tvs$year)) {
  hist(tvs[tvs$year == year, ]$size, breaks = breaks)
}
# NOTE(review): setwd() with an absolute, user-specific path only works on
# the original author's machine. It is kept so the working directory is the
# same as before for any later code; consider a project-relative path.
setwd("/Users/kendraosburn/syracuse/719")
happiness <- read.csv("happiness_project.csv")

# Long-format copy, kept for backward compatibility. melt() keeps only the id
# columns (year, happiness_score) and folds every other column into
# variable/value pairs, so this frame repeats each score once per melted
# column and has no `region` column of its own. Histograms drawn from it
# would over-count, and a loop over happiness_melt$region would never run.
# The plots below therefore read from `happiness` directly.
# Assumes each row of `happiness` is one observation to count -- TODO confirm.
happiness_melt <- melt(happiness, id = c("year", "happiness_score"))

# Score distribution for each of 2015-2017 on a common 0-10 axis, with that
# year's mean marked by a blue vertical line.
par(mfrow = c(3, 1))
for (year in c(2015, 2016, 2017)) {
  # which() drops rows whose year is NA instead of yielding NA scores.
  year_scores <- happiness$happiness_score[which(happiness$year == year)]
  hist(
    year_scores,
    xlim = c(0, 10),
    main = paste("Happiness scores,", year),
    xlab = "Score"
  )
  # na.rm = TRUE so a single missing score does not suppress the mean line.
  abline(v = mean(year_scores, na.rm = TRUE), col = "blue")
}

# One histogram per region, on a 3 x 3 grid.
par(mfrow = c(3, 3))
regions <- unique(happiness$region)
regions <- regions[!is.na(regions)]
for (region in regions) {
  region_scores <- happiness$happiness_score[which(happiness$region == region)]
  hist(
    region_scores,
    xlim = c(3, 8),
    main = paste(region),
    xlab = "Score",
    ylab = "Number of Countries"
  )
}

# One histogram per year present in the data, side by side.
par(mfrow = c(1, 3))
years <- unique(happiness$year)
for (year in years) {
  hist(
    happiness$happiness_score[which(happiness$year == year)],
    xlim = c(3, 8),
    main = paste(year),
    xlab = "Score",
    ylab = "Frequency"
  )
}
END