# IST 687 - Applied Data Science
# Week 4
#
#                                   W4 Teaching and HW4
# Step 1
# Sample data: small values plus one extreme outlier (10000)
v <- c(
  1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 10000
)
hist(v)
# curve() needs a function or an expression in x, so curve(v) stops with an
# error on a plain numeric vector; draw the values as a line plot instead.
plot(v, type = "l")
# inside a function do the following
mean(v)
median(v)
max(v)
min(v)
sd(v)
length(v)
# Count the elements equal to 1 (numeric comparison, no string coercion)
sum(v == 1)

#Your HW4 Function should look similar to this
# Print the mean, median and maximum of a numeric vector.
#
# x: numeric vector to summarise.
# The report is written to the console with cat(); nothing useful is returned.
Info <- function(x) {
  # statistical measurements
  avg <- mean(x)
  mid <- median(x)
  top <- max(x)
  # All Others (e.g. the remaining Step 1 measures) go here
  # Report the results
  cat("mean:", avg, "\nmedian:", mid, "\nmax:", top)
  # Terminate the last line so later console output starts on a fresh line
  cat("\n")
}
# Run the summary function on the Step 1 vector
Info(v)

# 50th percentile (the median) and 25th percentile of v
quantile(v, probs = 0.5)
quantile(v, probs = 0.25)
hist(v)

# skewness() comes from the "moments" package; install it once with:
# install.packages("moments")
library(moments)
skewness(v) # measure of symmetry

# Second vector: the 10000 value is listed first this time
v2 <- c(
  10000, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5, 6, 6, 7
)
mean(v2)
hist(v2)
skewness(v2)

# Load stringr; install it once with:
# install.packages("stringr")
library(stringr)

# rnorm(n, mean, sd) generates n random numbers with a normal distribution
hist(rnorm(n = 1000, mean = 10, sd = 10))

# Mean of a very small sample vs. skewness of a very large one
mean(rnorm(n = 3, mean = 0, sd = 2))
skewness(rnorm(n = 20000000, mean = 10, sd = 2))
# How to replicate
# HW3 step 2

A <- "Head"
B <- "Tail"

# 100 copies of each outcome
RepA <- replicate(100, A)
RepB <- replicate(100, B)

# Store all results in one bucket
MyBucket <- c(RepA, RepB)

# Sample function, subset: draw 20 items from the bucket with replacement
SamMyBucket <- sample(MyBucket, 20, replace = TRUE)
SamMyBucket

# number of tails in my sample
TailCount <- sum(SamMyBucket == "Tail")
# TailCount <- length(grep("Tail", SamMyBucket))
TailPerc <- TailCount / length(SamMyBucket) * 100

# End the report with a newline so the next console output starts on its
# own line instead of being glued to the "%" sign.
cat(TailCount, "Tail, which is", TailPerc, "%\n")

# create a function to do sampling
# Percentage of "Tail" values in a random sample drawn with replacement.
#
# v: vector to sample from.
# x: number of draws.
# Returns the share of draws equal to "Tail", on a 0-100 scale.
sam <- function(v, x) {
  draws <- sample(v, x, replace = TRUE)
  is_tail <- draws == "Tail"
  tail_count <- length(draws[is_tail])
  tail_count / length(draws) * 100
}


# One run: percentage of tails in 10 draws from the bucket
sam(MyBucket, 10)

# multiple iterations: average the percentage over 10 runs of 10 draws each
x <- mean(replicate(n = 10, expr = sam(MyBucket, 10)))
# store in a variable: 20 runs of 100 draws each
sampleMeans <- replicate(n = 20, expr = sam(MyBucket, 100))

hist(sampleMeans)
# NOTE(review): x is a single number here, so Info() prints the same value
# for mean, median and max -- confirm whether Info(sampleMeans) was intended.
Info(x)

#step #2 clarification
#6 Sample 10 samples from your Jar, e.g. sam(MyBucket, 10)
#7 generate a sample of 20 means, each was measured from the mean of 10 samples, e.g. replicate(20,mean(replicate(10,sam(MyBucket, 10))))
#8 generate a sample of 20 means, each was measured from the mean of 100 samples
#9 generate a sample of 100 means, each was measured from the mean of 100 samples

# Step 3: Explore the airquality dataset
# Question 10
D <- airquality
# Does the data frame contain any missing values?
anyNA(D)
# Remove every row that contains an NA using na.omit()
DD <- na.omit(D)
#use the following code to replace NA with the mean (na.rm = TRUE is needed, otherwise mean() itself returns NA)
#DataFrame$Column[is.na(DataFrame$Column)] <- mean(DataFrame$Column, na.rm = TRUE) 
#Mice package https://cran.r-project.org/web/packages/mice/mice.pdf