# Last updated June 30 2014
# Author: Claudia Wagner, claudia.wagner@gesis.org
options(digits=3)
# First time users: install packages by uncommenting the following line.
#install.packages(c('lmtest', 'boot', 'cluster'))
library('lmtest')
library('boot')
library('cluster')
#############################################################
##### Explore R ######
#############################################################
# List the variables currently defined in your workspace.
ls()
# Show the current working directory.
getwd()
# Change the working directory.
# NOTE(review): this path is machine-specific -- adjust it to your own machine.
setwd("C:/Users/wagnerca/Dropbox/Slides/ss2014-southampton/WebSciSS2014")
# Use help() to open the documentation page of a function, e.g. mean.
help(mean)
# Run the examples from a function's help page.
example(mean)
#############################################################
##### Input/Output ######
#############################################################
# Read/write CSV files.
# header = TRUE takes column names from the first line;
# row.names = 1 uses the first column as row labels.
data <- read.table("data.txt", header = TRUE, sep = ",", row.names = 1)
# col.names = NA writes a blank header entry above the row-name column;
# qmethod = "double" escapes embedded quotes by doubling them.
write.table(data, file="new-data.csv",sep = ",",col.names = NA,qmethod = "double")
# Select the first column by position (returned as a vector).
data[, 1]
# Select a column by name (returned as a one-column data frame).
# NOTE(review): assumes data.txt has a "Sepal.Length" column (iris-style data) -- confirm.
data["Sepal.Length"]
#############################################################
##### Point Estimates (central tendency and dispersion)######
#############################################################
# Task: Explore and use measures of central tendency
vec <- c(12, 13, 13, 15, 33, 44, 80)
median(vec)
# mode: the most frequent value(s); R has no built-in mode function,
# so we tabulate the values and keep those with the maximal count
temp <- table(vec)
names(temp)[temp == max(temp)]
# arithmetic mean
mean(vec)
# geometric mean: exp of the mean of the logs
exp(mean(log(vec)))
# harmonic mean: reciprocal of the mean of the reciprocals
1/mean(1/vec)
# five-number summary plus the mean
summary(vec)
# Task: Explore and use measures of dispersion
# range (max - min); named vec.range so we do not shadow base::range()
vec.range <- max(vec) - min(vec)
vec.range
# interquartile range
IQR(vec)
# sum of squared deviations from the mean
ss <- sum((vec - mean(vec))^2)
ss
# variance in sample (divides by n)
ss/length(vec)
# standard deviation in sample
(ss/length(vec))^.5
# Task: compare the inferential variance of the population with the sample variance! What do you observe?
# inferential variance (divides by n-1; the unbiased estimator used by var())
var(vec)
# inferential standard deviation
sd(vec)
# Task: Plot frequency vector and dispersion
# Bar chart of eye-colour counts, then a boxplot of the vec sample.
eye.counts <- c(2, 5, 3)
eye.colors <- c("blue", "braun", "green")
barplot(eye.counts, names.arg=eye.colors, ylab="Frequency")
boxplot(vec, ylab="Frequencies")
#######################################################
##### Interval Estimates(Confidence Intervals) ######
#######################################################
# Task: Compute normal CI with known variance
# sample size
n <- 50
# confidence level in percent
conf <- 95
# known population standard deviation
sigma <- 15
# sample mean
m.hat <- 94.6
# significance level (e.g. 0.05 for a 95% CI)
alpha <- (100-conf)/100
# standard error of the mean (known sigma)
se.hat <- sigma / n^0.5
# margin of error moe
# To get quantiles or "critical values", you can use qnorm()
# qnorm(.95) p = .05, one-tailed (upper)
# qnorm(c(.025,.975)) p = .05, two-tailed
moe <- se.hat * qnorm(1-alpha/2)
lower <- m.hat - moe
upper <- m.hat + moe
c(lower, upper)
# Task: Compute CI for the mean of a single sample using t
# standard deviation estimated from the sample (not the variance)
sigma.hat <- 19.6
# estimated standard error of the mean
se.hat <- sigma.hat / n^0.5
# margin of error moe
# To get quantiles or "critical values", use the t-distribution qt()
# BUG FIX: the original called gt(), which does not exist -- qt() is
# the quantile function of the t distribution.
moe <- se.hat * qt(1-alpha/2, n-1)
lower <- m.hat - moe
upper <- m.hat + moe
c(lower, upper)
# Task: Compute CI for discrete data
# Assume we made 24 trials and observed 19 successes.
# Give an exact CI for the observed success proportion with confidence level 0.99.
# Wilson CI
prop.test(19, 24)
prop.test(19, 24, conf.level=.99, correct=FALSE)$conf.int
# we can also use the binomial distribution
#binom.exact(19, 24, tsmethod="blaker", conf.level=0.99)
# Task: Compute the CI for a difference in independent means
v1 <- c(1, 2, 3, 4, 5, 6, 7)
v2 <- c(4, 5, 6, 7, 8, 9, 10)
# one-sample CI for each mean, then the CI of the difference
t.test(v1, conf.level=.95)$conf.int
t.test(v2, conf.level=.95)$conf.int
t.test(v1, v2, conf.level=.95)$conf.int
# Task: Compute the CI for a difference in proportions
# In group 1 we observe 24 smokers out of 50,
# in group 2 we observe 3 smokers out of 50.
# Compute the 95% CI for the difference between 24/50 and 3/50.
s <- c(24, 3)
n <- c(50, 50)
prop.test(s, n)$conf.int
# Task: generate a bootstrap estimate for the CI of the median of a single sample
medboot <- boot(v1, function(d, idx) median(d[idx]), R=10000)
temp <- boot.ci(medboot, conf=.95)
#######################################################
##### Sampling distributions and standard errors ######
#######################################################
# Consider the mean of a Bernoulli random variable: a coin lands on 1 with
# probability p and on 0 with probability (1-p). We flip it N times, record
# the proportion of 1s, and repeat that whole experiment num.replications times.
N <- 100 # flips per experiment; impacts the CI and the variance
p <- 0.2 # success probability of a single flip
num.replications <- 2000 # number of repeated experiments
# rbinom(N, 1, p) draws N Bernoulli(p) values; replicate() repeats the whole
# experiment, so `replications` holds the sampling distribution of the mean.
replications <- replicate(num.replications, mean(rbinom(N, 1, p)))
# Task: Create the sampling distributions and standard errors.
# Construct the confidence intervals for p for a few values of p and N:
# you will see that the standard error rapidly diminishes with N.
# The histogram shows the variability of the estimates,
# i.e. the sampling distribution.
hist(replications, xlim=c(0,0.4))
var(replications)
# The standard error is the standard deviation of the sampling distribution,
# which is approximately normal for sufficiently large N.
est.se <- sd(replications)
mean.est.p <- mean(replications)
mean.est.p
est.se
# normal-approximation 95% CI around the mean of the estimates
mean.est.p + c(-1, 1) * 1.96 * est.se
# For large sample sizes (i.e., sample size close to population size) we can
# approximate the SE from a single trial, without explicitly constructing
# the sampling distribution.
single.trial <- rbinom(N, 1, p)
p.single.trial <- mean(single.trial)
# normal approximation of the SE of a proportion: sqrt(p * (1-p) / N)
se <- sqrt(p.single.trial*(1-p.single.trial)/N)
se
# normal-approximation 95% CI
p.single.trial + c(-1, 1) * 1.96 * se
# The same estimate and standard error fall out of an intercept-only
# linear regression on the raw 0/1 outcomes.
trial.data <- data.frame(y=single.trial)
coeftest(lm(y ~ 1, data=trial.data))
#######################################################
##### Hypothesis Tests ######
#######################################################
# Task: Calculate a one-sample t test on summary statistics
# H0: mean == 100
# We have one sample of size n=50 with mean 94.6 and std 19.6.
# First we have to estimate the SE, since the variance is unknown.
n <- 50
mean.hat <- 94.6
sd.hat <- 19.6
se.hat <- sd.hat / n^0.5
mean.null <- 100
# conventional t statistic: (observed mean - null mean) / SE
t.obs <- (mean.hat - mean.null) / se.hat
# get the two-sided p-value: twice the tail area beyond |t.obs|.
# BUG FIX: the original computed 2 * pt(t.obs, n-1) with a POSITIVE t.obs,
# which yields a "p-value" of about 1.95 (> 1); using -abs(t.obs) always
# takes the correct (lower) tail.
p.value <- 2 * pt(-abs(t.obs), n-1)
p.value
# p-value is the conditional probability that a statistic is as extreme or more as we observe it
# assuming that H0 is true
# low p-value < 0.05 means that if we assume H0 is true the probability of observing
# our data is less than 5%
# Conduct t-tests on sample data
v1 <- c(1, 2, 3, 4, 5, 6, 7)
v2 <- c(4, 5, 6, 7, 8, 9, 10)
# default null hypothesis: the true mean is 0
t.test(v1)
# we can reject H0 since p < 0.05
# Task: test H0: mean of v1 == 4
t.test(v1, mu=4)
# we cannot reject H0
# Task: test H0: mean v1 == mean v2 --> i.e. the difference in means is 0
# (Welch's t-test: variances are not assumed to be equal)
t.test(v1, v2, mu=0)
# Student's t-test: if we can assume equal variances
t.test(v1, v2, mu=0, var.equal=TRUE)
## pairwise t-tests with corrections for multiple testing
## (uses the built-in airquality data set; we access its columns directly
## instead of attach()/detach(), which mutate the global search path)
aq.month <- factor(airquality$Month, labels = month.abb[5:9])
# default adjustment: Holm correction
pairwise.t.test(airquality$Ozone, aq.month)
# Bonferroni correction
pairwise.t.test(airquality$Ozone, aq.month, p.adj = "bonf")
# without a pooled standard deviation (pairwise Welch tests)
pairwise.t.test(airquality$Ozone, aq.month, pool.sd = FALSE)
# Task: Explore the Pearson Chi-Squared Goodness-of-Fit Test
# Flip a coin 10 times and observe 3 heads.
# How likely is this observation if H0 assumes equiprobability (p = 0.5 and 1-p = 0.5)?
chisq.test(c(3, 7))
# by default H0 always assumes equiprobability;
# we cannot reject H0 -- but what if we had observed only 1 head?
chisq.test(c(1, 9), p = c(5, 5), rescale.p = TRUE)
# we can define the expectation by hand and thereby change H0;
# let's assume H0 should be that p(head) = 0.6
chisq.test(c(3, 7), p = c(6, 4), rescale.p = TRUE)
# The warning refers to a small expected count in one cell: the chi-square
# approximation becomes unreliable when expected counts are small.
# We can use a simulated (Monte Carlo) p-value instead:
chisq.test(c(3, 7), p = c(6, 4), rescale.p = TRUE, simulate.p.value = TRUE)
# Task: Explore the relation between political party membership and
# nightlife preference (2 categorical variables) via a bootstrap CI.
# Random sample of fb users for whom we know their political position:
# How many of them have ever checked in at a nightclub?
# Democrats: 100/1000 said yes
#   NOTE(review): the original comment said 700/1000, but the code below
#   (and the result comments) clearly use 100/1000.
# Republicans: 90/1000 said yes
# Do the nightlife preferences differ by political party?
# Give a 95% CI for the difference in proportions.
dems = rep( c(0,1), c(1000-100, 100) )
repubs = rep( c(0,1), c(1000-90, 90) )
mean(dems) #0.1
mean(repubs) #0.09
del.p = mean(dems) - mean(repubs) #0.01 (point estimate)
# generate the sampling distribution of the difference in means
# by resampling each group with replacement 1000 times
reps = replicate( 1000, {
ds = sample( dems, 1000, replace=TRUE )
rs = sample( repubs, 1000, replace=TRUE )
mean( ds ) - mean( rs )
} )
# the SE is the SD of the sampling distribution
SE = sd( reps ) # ~0.0131 (varies slightly across runs)
SE
c( del.p - 2*SE, del.p + 2*SE ) # roughly -0.0162 0.0362 (interval estimate)
# Task: Explore differences in proportions of 2 independent samples.
# Sex difference in smoking behavior:
# 70 of 190 women in our study smoked,
# 65 of 205 men smoked.
prop.test(c(70, 65), c(190, 205))
#############################################################
##### Regression ######
#############################################################
# Task: conduct a linear regression between 2 variables
# Can we use the observed time to explain/predict the distance?
# Note: the independent t-test is a special case of a regression where the
# predictor is binary
time <- c(3, 6, 9, 12, 15, 18)
distance <- c(6.3, 3.9, 2.3, 2, 2.8, 1.6)
# the lm function fits a linear model using least-squares estimation
# Y = b0 + X*b1, written in R as lm(Y ~ X)
lm.res = lm(distance ~ time)
# let's plot the data and the fitted line
plot (time, distance, xlab="Time", ylab="Distance")
# draw the fitted line from the model itself instead of hard-coding the
# rounded coefficients (the original used abline(5.86, -0.2581))
abline(lm.res, lty=2)
# how well does the model fit the data? Explore R squared.
# (the component is named "r.squared"; the original's "$r.square" only
# worked through $'s partial matching)
summary(lm.res)$r.squared
# around 70% of the variability in the distance variable can be explained by the time variable
summary(lm.res)
# note that the residual standard error is the standard error of the estimate
# no CIs are provided by summary(); let's compute them
confint(lm.res, level=.95)
# use the model to predict Y for new values of X, with confidence intervals
new.data = data.frame(time=c(10.5, 45))
predict(lm.res, newdata = new.data, interval='confidence')
#############################################################
##### ANOVA ######
#############################################################
# ANOVA is a special case of a regression involving nominal predictors
## Task: Conduct a One-Way ANOVA.
## We have 2 groups of people, 10 people per group.
## Does the group membership explain the test scores we observe?
group <- rep(1:2, each = 10)
observation <- c(2, 3, 1, 1, 1, 1, 1, 1, 1, 1, 6, 7, 5, 6, 7, 5, 6, 7, 5, 3)
data <- data.frame(Group=factor(group), Observation=observation)
# Can we explain our observations via the group memberships?
# I.e. does group membership have a significant effect on the observed values?
aov_res = aov(Observation ~ Group, data = data)
summary(aov_res)
###########################################
### Clustering Example
### In parts taken from: http://www.stat.sc.edu/~hitchcock/chapter6_R_examples.txt
###########################################
food <- read.table("http://www.stat.sc.edu/~hitchcock/foodstuffs.txt", header=TRUE)
attach(food)
head(food)
# The hclust function requires that a distance object be input.
# Scale the data by dividing each variable by its standard deviation.
# BUG FIX: sd() on a matrix was deprecated in R 2.14 and removed in R 3.0,
# so sd(as.matrix(food[,-1])) errors in modern R; compute the per-column
# standard deviations with apply() instead.
std <- apply(food[,-1], 2, sd) # standard deviation of each variable (column)
food.std <- sweep(food[,-1],2,std,FUN="/")
# Calculating pairwise Euclidean distances between the (standardized) objects:
dist.food <- dist(food.std)
###########################################
### Kmeans
###########################################
# A K-means clustering with k = 5.
# A generous iteration cap (iter.max) plus many random starts (nstart)
# make the result much more stable across runs.
food.k5 <- kmeans(food.std, centers=5, iter.max=100, nstart=25)
food.k5
# Let's try k = 4:
food.k4 <- kmeans(food.std, centers=4, iter.max=100, nstart=25)
food.k4
# Printing the clustering vector for the 4-cluster solution:
food.k4$cluster
# Collect the Food labels belonging to each of the 4 clusters:
food.k4.clust <- lapply(1:4, function(k) Food[food.k4$cluster==k])
food.k4.clust # printing the clusters in terms of the Food labels
### Choose best k
# Choose the value of k associated with the LARGEST average silhouette width.
# The silhouette width is -- like the diameter or the radius -- a measure of
# cluster quality: it measures how tightly grouped the data in each cluster are.
# With too many or too few clusters, some clusters will typically display
# much narrower silhouettes than the rest.
# Print the average silhouette width for k = 2, 3, 4, 5:
for (k in 2:5) {
  km <- kmeans(food.std, centers=k, iter.max=100, nstart=25)
  print(summary(silhouette(km$cluster, dist(food.std)))$avg.width)
}
############# Visualization of Clusters:
### Via the scatterplot matrix:
###
# Label each point in the scatterplot matrix with its cluster number.
pairs(food[,-1], panel=function(x,y) text(x,y,food.k4$cluster))
# Cluster 1 foods tend to be high in calcium. (this comment does not reflect all runs of the algorithm)
# Cluster 4 foods tend to be high in fat. (this comment does not reflect all runs of the algorithm)
### How can we visualize the high-dimensional clustered data?
### Produce a 2-dimensional plot using the scores on the first 2
### principal components, with the clusters separated by color:
food.pc <- princomp(food[,-1],cor=TRUE)
# Map each cluster number to a color (cluster 1 -> green, 2 -> blue,
# 3 -> red, 4 -> orange), one color per observation:
my.color.vector <- c("green", "blue", "red", "orange")[food.k4$cluster]
# Plotting the PC scores:
par(pty="s")
plot(food.pc$scores[,1], food.pc$scores[,2], ylim=range(food.pc$scores[,1]),
xlab="PC 1", ylab="PC 2", type ='n', lwd=2)
text(food.pc$scores[,1], food.pc$scores[,2], labels=Food, cex=0.7, lwd=2,
col=my.color.vector)
# Cluster 1 is the "canned seafood" cluster. (this comment does not reflect all runs of the algorithm)
# Cluster 2 is the clams cluster. (this comment does not reflect all runs of the algorithm)
## NOTE: The default for the kmeans function in R is the Hartigan-Wong (1979) algorithm.
## The MacQueen algorithm (1967) can be used by altering the code to, say:
## kmeans(food.std, centers=4,algorithm="MacQueen")
## You can try it in this case -- I don't think the MacQueen algorithm produces as good of a result.
###########################################
### Agglomerative Hierarchical Clustering
##########################################
# see: http://stat.ethz.ch/R-manual/R-patched/library/stats/html/hclust.html
# We inspect 3 different linkage methods. Each starts by assigning every
# point to its own cluster; the methods differ only in how the distance
# between two clusters is defined.
# Single linkage:
# distance between 2 clusters = SHORTEST possible distance between their
# members (focus on best friends).
food.single.link <- hclust(dist.food, method='single')
# Plotting the single linkage dendrogram:
plot(food.single.link, labels=Food, ylab="Distance")
# Complete linkage:
# distance between 2 clusters = LONGEST possible distance between their
# members (focus on worst friends).
food.complete.link <- hclust(dist.food, method='complete')
# Plotting the complete linkage dendrogram:
plot(food.complete.link, labels=Food, ylab="Distance")
# Average linkage:
# distance between 2 clusters = AVERAGE distance between their members.
food.avg.link <- hclust(dist.food, method='average')
# Plotting the average linkage dendrogram:
plot(food.avg.link, labels=Food, ylab="Distance")
# Note: the complete linkage algorithm is slightly less prone to forming
# "outlier-only" clusters.