# Introduction to Measurement
# Eric Guntermann, Centre for the Study of Democratic Citizenship, January 8th, 2016
# Special thanks to Bill Jacoby and Dave Armstrong, who taught me scaling at ICPSR in 2013.
# Much of the code below is an adaptation of code they wrote.
## Summated rating scale
library(foreign) # read.dta() lets us open Stata (and other) files
library(lattice) # lattice graphics (histogram, xyplot)
library(MiscPsycho) # alpha() for reliability analysis
# Read in the dataset. It must be in the working directory first:
# Session > Set Working Directory > To Source File Location
ces11 <- read.dta("ces11.dta")
# This dataset (2011 CES) contains responses to six items asking people to
# indicate how much confidence they have in various institutions (courts,
# civil service, police, federal government, provincial government,
# Elections Canada). Higher values indicate greater confidence. Note that the
# variables have been converted to numeric and missing values have been deleted.
summary(ces11) # summary of each variable
# Standardize all the items (mean 0, sd 1) so they contribute equally
ces11 <- as.data.frame(scale(ces11)) # scale() returns a matrix, so we convert
# the data back to a data frame.
# Cronbach's alpha assesses the reliability (internal consistency) of the scale
alpha_ces11 <- alpha(ces11) # calculate alpha
alpha_ces11$coefficients # display alpha. This is quite high.
# Create the summated rating scale by averaging each respondent's
# (standardized) item responses. rowMeans() is clearer and faster than
# apply(..., MARGIN = 1, FUN = mean), and we avoid calling the result "scale",
# which would mask the base::scale() function used elsewhere in this script.
conf_scale <- rowMeans(ces11)
# Summary statistics of the scale
summary(conf_scale)
sd(conf_scale)
# Histogram of the scale
histogram(~conf_scale, aspect = .75, xlab = "Summated rating scale")
## Item analysis of scale
# Conditional alphas: alpha recomputed with each item deleted in turn.
# If alpha goes up when we remove an item (i.e. the reliability of the scale
# increases), then we should remove that item.
alpha_ces11$condAlpha # Alpha would decrease if we removed any item from the
# scale, which suggests the reliability of our scale is good. However, alpha
# tends to underestimate reliability, so let's check using restplots to verify
# that all items have a monotonic relationship to the restscores.
# The following function, written by Dave Armstrong, calculates restscores and
# plots them against the items.
source("restplot.R") # Make sure the file is in the working directory
# (Session > Set Working Directory > To Source File Location)
restplot(ces11, jitter = .5, as.table = TRUE) # TRUE spelled out (T can be
# reassigned). All items have a monotonic relationship to the restscores,
# the central assumption of summated rating scales, so our scale is good.
## Principal components analysis
# For this and the following sections we use like-dislike scales of political
# parties and party leaders from the survey conducted by the Making Electoral
# Democracy Work project for the 2012 Quebec election.
# The dataset contains a vote-intention variable (used later) and the
# like-dislike questions.
qc12 <- read.dta("qc12.dta")
ld <- qc12[, 2:7] # for now, we'll only use evaluations of the parties
ld.pca <- princomp(ld, cor = TRUE) # cor = TRUE: analyze the correlation matrix
summary(ld.pca, loadings = TRUE) # Shows the standard deviation of each
# component, the proportion of variance each captures, and the cumulative
# proportion of variance. The first two components account for about 69% of
# the variance.
# Since we specified loadings = TRUE, we also get the component loadings,
# which tell us how each variable loads on each component. The components are
# linear combinations of the variables, with the loadings as coefficients.
biplot(ld.pca) # A biplot shows how much variation in each variable is
# captured by each component, and the position of each respondent on each
# dimension. This one is two-dimensional, but biplots can also be 3-D.
# This looks a bit messy. Hide the respondent points by making them white and
# make the variable vectors black:
biplot(ld.pca, col = c("white", "black"))
# Respondents' values on the components (their scores):
ld.pca$scores
# Append the first two components to the original dataset and regress vote
# intentions on them.
qc12[, c("comp1", "comp2")] <- ld.pca$scores[, 1:2]
library(nnet) # for the multinom() function
mod1 <- multinom(vote ~ comp1 + comp2, data = qc12)
library(effects) # visualize the effects of the two components
plot(allEffects(mod1)) # effects plots for all independent variables
## Factor analysis
# Let's use the like-dislike scores of both parties and leaders
ld2 <- qc12[, 2:12]
R <- cor(ld2) # correlations among the variables
# Estimated unique variances on the diagonal: psi2_i = 1 / [R^-1]_ii.
# diag(1 / diag(solve(R))) is algebraically identical to the original
# solve(diag(diag(solve(R)))) but avoids inverting a matrix a second time.
psi2.hat <- diag(1 / diag(solve(R)))
Rc <- R - psi2.hat # reduced correlation matrix (common variance)
# To figure out the number of factors we can use theory or a scree plot.
# It tells us how many factors are actually useful (in the sense of
# explaining variance). This is how we make one:
eigenv <- eigen(Rc)$values
plot(eigenv, type = "b", ylab = "Eigenvalue", xlab = "Ordinal # of Eigenvalue", pch = 16)
# We take the number of factors corresponding to the first bend in the scree
# plot (2); any further factors would not explain much variance.
# Now let's perform the factor analysis. First with no rotation:
library(psych)
# Helper: draw factor loadings as arrows from the origin in the space of the
# first two factors, labelled with the variable names. (The original repeated
# these four plotting lines for every rotation.)
plot_loadings <- function(lambda, labels) {
  plot(c(-1, 1), c(-1, 1), type = "n", xlab = "Factor 1", ylab = "Factor 2")
  arrows(rep(0, nrow(lambda)), rep(0, nrow(lambda)), lambda[, 1], lambda[, 2])
  abline(h = 0, v = 0)
  text(lambda[, 1], lambda[, 2], labels)
}
fa1 <- fa(scale(ld2), nfactors = 2, rotate = "none", fm = "pa")
lambda <- loadings(fa1) # factor pattern matrix
lambda # loadings of each variable on each factor
# Plot the unrotated solution:
plot_loadings(lambda, colnames(ld2))
# Now, a promax (oblique) rotation
fa2 <- fa(scale(ld2), nfactors = 2, rotate = "promax", fm = "pa")
lambda2 <- loadings(fa2)
plot_loadings(lambda2, colnames(ld2))
# And a varimax (orthogonal) rotation
fa3 <- fa(scale(ld2), nfactors = 2, rotate = "varimax", fm = "pa")
lambda3 <- loadings(fa3)
plot_loadings(lambda3, colnames(ld2))
# The promax (oblique) rotation appears to provide the simplest structure
# (i.e. factors run through the clouds of variable vectors).
# The first factor appears to be a left-right dimension, while the second
# seems to capture feelings about the CAQ and its leader.
# Scores of each survey respondent:
fa2$scores
# Add them to the dataset
qc12$f1 <- fa2$scores[, 1]
qc12$f2 <- fa2$scores[, 2]
# We can now run regression models with them
mod2 <- multinom(vote ~ f1 + f2, data = qc12)
plot(allEffects(mod2))
## Multidimensional scaling (MDS)
# The package we will use is smacof ("Scaling by MAjorizing a COmplicated
# Function").
# Let's use the ratings of parties and leaders again.
tld2 <- t(ld2) # dist() computes distances among rows, and we want distances
# among parties/leaders, so we take the transpose of the data matrix
dismat <- dist(tld2) # dissimilarities among parties/leaders
dismat # take a look at the dissimilarities
# We're not sure whether the data are interval/ratio or ordinal, so let's try
# both metric and non-metric MDS. Metric first.
# To find the optimal number of dimensions, we use a scree plot. We make the
# coercion of the "dist" object to a full symmetric matrix explicit (eigen()
# would coerce it via as.matrix() anyway):
eigenv <- eigen(as.matrix(dismat))$values
plot(eigenv, type = "b", ylab = "Eigenvalue", xlab = "Ordinal # of Eigenvalue", pch = 16)
# The elbow suggests 2 dimensions are best
# Run the metric MDS
library(smacof)
metric <- smacofSym(dismat, ndim = 2, type = "ratio", ties = "primary", verbose = TRUE)
metric # Summary of the fit. Stress is a badness-of-fit measure;
# usually anything below 0.05 is considered good.
# Get the point configuration
metric.config <- as.data.frame(metric$conf)
metric.config
# These are the coordinates of the points on both axes. Now graph them:
xyplot(D2 ~ D1, data = metric.config,
       aspect = 1,
       panel = function(x, y) {
         panel.xyplot(x, y, col = "black")
         # point 2's label is nudged up instead of down so labels don't overlap
         panel.text(x[-2], y[-2] - .03, labels = rownames(metric.config)[-2], cex = .75)
         panel.text(x[2], y[2] + .04, labels = rownames(metric.config)[2], cex = .75)
       },
       xlab = "MDS Axis 1",
       ylab = "MDS Axis 2",
       xlim = c(-1, 1), # equal limits keep the aspect ratio meaningful
       ylim = c(-1, 1)
)
# There are two ways to interpret MDS output: we can think about dimensions,
# or about clusters of points.
# We can also use external information (for example, coded party platforms):
# we would regress the external information on the coordinate axes.
# Dimensions don't have to correspond to the axes.
# For metric MDS to be appropriate, there should be a linear relationship
# between the input dissimilarities and the fitted distances. A Shepard
# diagram shows that relationship:
scaledd <- as.vector(metric$confdiss)
inputd <- as.vector(metric$delta)
# scaledd: distances in the fitted configuration; inputd: input dissimilarities
xyplot(scaledd ~ inputd,
aspect = 1,
panel = function (x, y) {
panel.xyplot(x, y, col = "black")
# robust local regression line to judge linearity
panel.loess(x, y, span = .5, family = "symmetric", degree = 1)
},
xlab = "Input dissimilarities",
ylab = "Distances obtained from MDS solution"
)
# It's pretty close to linear, but since it isn't exactly, let's try
# non-metric (ordinal) MDS
nonmetric <- smacofSym(dismat, ndim = 2, type = "ordinal", ties = "primary", verbose = TRUE)
nonmetric.config <- as.data.frame(nonmetric$conf)
# and plot the configuration
xyplot(D2 ~ D1, data = nonmetric.config,
       aspect = 1,
       panel = function(x, y) {
         panel.xyplot(x, y, col = "black")
         panel.text(x[-2], y[-2] - .03, labels = rownames(nonmetric.config)[-2], cex = .75)
         panel.text(x[2], y[2] + .04, labels = rownames(nonmetric.config)[2], cex = .75)
       },
       xlab = "MDS Axis 1",
       ylab = "MDS Axis 2",
       xlim = c(-1, 1),
       ylim = c(-1, 1)
)
# This looks a bit weird; check the Shepard diagram
scaledd2 <- as.vector(nonmetric$confdiss)
inputd2 <- as.vector(nonmetric$delta)
xyplot(scaledd2 ~ inputd2,
       aspect = 1,
       panel = function(x, y) {
         panel.xyplot(x, y, col = "black")
         # BUG FIX: the original passed "degress = 1" (a typo), so the
         # intended degree-1 local fit was never requested.
         panel.loess(x, y, span = .5, family = "symmetric", degree = 1)
       },
       xlab = "Input dissimilarities",
       ylab = "Distances obtained from MDS solution"
)
# The relationship between dissimilarities and distances is monotonic as
# required, but the distances don't distinguish much among the points, so
# metric MDS seems preferable.
## Wordfish
library(tm)
# Build a corpus from a folder of party manifestos (replace the placeholder
# path below with the actual directory).
corp <- Corpus(DirSource("directory with manifestos"), readerControl=list(language="english", load=TRUE))
# Standard preprocessing: lowercase, drop stopwords, punctuation and numbers.
# NOTE(review): recent versions of tm require wrapping plain functions such as
# tolower in content_transformer(); this code assumes the older tm API.
corp <- tm_map(corp, tolower)
corp <- tm_map(corp, removeWords, stopwords("english"))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, function(x) gsub("\\W", " ", x)) # remaining non-word chars
dat <- tm_map(corp, PlainTextDocument)
dtm <- DocumentTermMatrix(dat)
dat <- inspect(dtm) # NOTE(review): in older tm inspect() returned the matrix;
# with current tm use as.matrix(dtm) instead — verify against installed version
# Sum counts for duplicate terms, then rebuild a documents-by-terms matrix
dat <- data.matrix(t(aggregate(t(dat) ~ rownames(t(dat)), FUN=sum)))
colnames(dat) <- dat[1,] # first row holds the term names after the transpose
dat <- dat[2:nrow(dat),] # drop that header row
# Convert every column to numeric. apply() keeps the column names and replaces
# the original pattern of growing a matrix with cbind() inside a loop.
dat <- apply(dat, 2, as.numeric)
rownames(dat) <- c("Conservative", "Liberal", "NDP", "Green")
# Count how many elements of x are exactly zero.
# Note: as in a plain sum(x == 0), any NA in x makes the result NA.
count0s <- function(x){
  is_zero <- x == 0
  sum(is_zero)
}
# For each word (column), count the documents in which it does not appear
zeros <- apply(dat, 2, count0s)
# Keep words with fewer than 2 zero counts, i.e. words used in at least
# 3 of the 4 manifestos
dat <- dat[, zeros < 2]
View(dat) # NOTE(review): View() needs an interactive session (e.g. RStudio)
library(austin) # use install.packages() to install it
wfm <- as.wfm(dat, word.margin = 2) # words are on margin 2 (the columns)
# dir = c(3, 1) anchors the direction of the latent scale using documents
# 3 and 1 — confirm the intended ordering against the austin documentation
results <- wordfish(wfm, dir = c(3, 1))
plot(results)
# Word-level parameters estimated by wordfish
slopes <- results$beta
intercepts <- results$psi
plot(slopes, intercepts, type = "n")
# "labels" spelled out (the original's "label=" relied on partial matching)
text(slopes, intercepts, labels = results$words)