File:NZ Elections 2005-2008 - PPM bias.png
From Wikimedia Commons, the free media repository
Jump to navigation
Jump to search
NZ_Elections_2005-2008_-_PPM_bias.png (600 × 600 pixels, file size: 8 KB, MIME type: image/png)
File information
Structured data
Captions
Summary[edit]
DescriptionNZ Elections 2005-2008 - PPM bias.png |
English: Biases in the preferred prime minister polls. Bias is defined as (p-s)/s, where p is the value estimated by the individual poll, and s is the "mean" value estimated using a Loess smoother taking into account all polls in the period 2005-2008 (see e.g. the Wikipedia page Opinion_polling_for_the_New_Zealand_general_election,_2008 for the figure showing the Loess smoother and the source data) |
Date | |
Source | Own work |
Author | Mark Payne, Denmark |
Figure is produced using the R statistical package, using the following code. It first reads the HTML directly from the website, then parses the data and saves the graph into your working directory. It should be able to be run directly by anyone with R.
#Load the complete HTML file into memory.
#The workspace is deliberately NOT cleared with rm(list=ls()): sourcing this
#script should not destroy objects the caller already has.
#Wikipedia is served over HTTPS, so that scheme is requested directly.
wiki.con <- url("https://en.wikipedia.org/wiki/Opinion_polling_for_the_New_Zealand_general_election,_2008")
#Close only the connection opened here, even when the download fails;
#closeAllConnections() would also tear down unrelated connections and sinks.
html <- tryCatch(readLines(wiki.con,encoding="UTF-8"),finally=close(wiki.con))
#The fourth table on the page is the opinion poll data.
#Find the lines holding the opening and closing table tags.
table.start <- grep("<table.*",html)
table.end <- grep("</table.*",html)
#Fail with a readable message if the page no longer has four tables, rather
#than with the "NA/NaN argument" error that NA:NA would raise.
if(length(table.start)<4 || length(table.end)<4) {
  stop("Expected at least four HTML tables on the page but found ",
       min(length(table.start),length(table.end)),
       "; the page layout has probably changed",call.=FALSE)
}
tbl <- html[table.start[4]:table.end[4]]
#Now split it into the rows: every line that is exactly "<tr>" starts a new
#group, and everything before the first such line lands in group "0".
tbl.rows <- split(tbl,cumsum(tbl=="<tr>"))
#Now extract the data: each table row is turned into a one-row data frame
#holding the polling company, the mid-point date of the survey and the four
#values found in columns 3-6.
survey.dat <- lapply(tbl.rows,function(x) {
  #Start by only considering the lines that carry <td> tags. Rows without
  #any (e.g. header rows) yield NA in every field, including the date.
  td.tags <- x[grep("<td",x)]
  #Polling data appears in columns 3-6
  dat <- td.tags[3:6]
  #Now strip the markup and convert to numeric format. Every hyphen is
  #replaced by "0" (presumably a lone "-" marks an empty cell - TODO confirm);
  #anything that is still not a number becomes NA, with a warning.
  dat <- gsub("<td>|</td>","",dat)
  dat <- gsub("%","",dat)
  dat <- gsub("-","0",dat)
  dat <- as.numeric(dat)
  #Getting the date strings is a little harder. The approach taken is to use
  #the title="..." attributes of the hyperlinks in the date cell.
  date.str <- td.tags[2] #Dates are in the second column
  #Throw out anything between superscript tags, as it is a reference to the source
  date.str <- gsub("<sup.*</sup>","",date.str)
  #Find the location of the title attributes (non-greedy match)
  titles <- gregexpr("(?U)title=\".*\"",date.str,perl=TRUE)[[1]]
  #Cut out the text between the quotes of every title="..." in one go:
  #skip the 7 characters of 'title="' and drop the closing quote
  date.strings <- substring(date.str,titles+7,titles+attr(titles,"match.length")-2)
  #The last title is used as the year; the ones before it are combined with
  #that year and parsed as "month-name day year"
  yr <- rev(date.strings)[1]
  n.days <- length(date.strings)-1
  if(n.days>=1) {
    dates <- as.POSIXct(strptime(paste(date.strings[seq_len(n.days)],yr),"%B %d %Y"))
    #The survey is placed at the mean of its dates
    survey.time <- mean(dates)
  } else {
    #Fewer than two titles: no date can be built, so mark the row as undated
    #instead of looping over the index sequence 1:0
    survey.time <- as.POSIXct(NA)
  }
  #Get the name of the survey company too
  survey.comp <- td.tags[1]
  survey.comp <- gsub("<sup.*</sup>","",survey.comp)
  survey.comp <- gsub("<td>|</td>","",survey.comp)
  #Turn en dashes into plain hyphens. Both the real character and the
  #"<U+2013>" escape text are handled; the escape must be replaced before
  #the tag-stripping step below, which would otherwise delete it.
  survey.comp <- gsub("\u2013","-",survey.comp,fixed=TRUE)
  survey.comp <- gsub("<U+2013>","-",survey.comp,fixed=TRUE)
  #Remove any remaining tags
  survey.comp <- gsub("(?U)<.*>","",survey.comp,perl=TRUE)
  #And now return results
  data.frame(Company=survey.comp,Date=survey.time,t(dat))
})
#Combine the one-row data frames into a single table
surveys <- do.call(rbind,survey.dat)
leaders <- c("Helen Clark","Don Brash","John Key","Winston Peters")
colnames(surveys) <- c("Company","Date",leaders)
#Plotting colours, in the same order as the leaders
cols <- c("red","purple","blue","black")
#Now extract the data and filter out the rows without a date
polls <- subset(surveys,!is.na(surveys$Date))
#Put the polls in chronological order: the smoothed curves are drawn in row
#order and the last row is taken to be the most recent poll, so neither
#should depend on how the source table happens to be sorted
polls <- polls[order(polls$Date),]
#Work with dates as plain numbers from here on
polls$Date <- as.double(polls$Date)
#And start making the graph!
#Vertical guides: September 2005, then January, May and September of 2006-2008
tick.years <- c(2005,rep(2006,3),rep(2007,3),rep(2008,3))
tick.months <- c(9,rep(c(1,5,9),3))
ticks <- ISOdate(tick.years,tick.months,1)
#The x axis runs from the first tick to February 2009
xlims <- range(as.double(c(ticks,ISOdate(2009,2,1))))
png("NZ_opinion_polls_2005-2008 -PPM.png",width=778,height=487,pointsize=16)
par(mar=c(3,4,1,1))
#Set up the axes without drawing any points yet (pch=NA)
matplot(polls$Date,polls[,leaders],pch=NA,xlim=xlims,
        ylab="Preferred Prime Minister (%)",xlab="",col=cols,xaxt="n",ylim=c(0,60))
#Light grey guide lines
abline(h=seq(0,95,by=5),col="lightgrey",lty=3)
abline(v=as.double(ticks),col="lightgrey",lty=3)
#Now add one loess smoother per leader, keeping fit and standard error
smoothed <- vector("list",length(leaders))
for(k in seq_along(leaders)) {
  fit.k <- loess(polls[,leaders[k]] ~ polls[,"Date"],span=0.33)
  smoothed[[k]] <- predict(fit.k,se=TRUE)
  # polygon(c(polls[,"Date"],rev(polls[,"Date"])),
  #         c(smoothed[[k]]$fit+smoothed[[k]]$se.fit*1.96,rev(smoothed[[k]]$fit-smoothed[[k]]$se.fit*1.96)),
  #         col=rgb(0.5,0.5,0.5,0.5),border=NA)
  #The drawn curve is not allowed to go below zero
  curve.k <- pmax(0,smoothed[[k]]$fit)
  lines(polls[,"Date"],curve.k,col=cols[k],lwd=2)
}
names(smoothed) <- leaders
#Individual polls go on top of the curves
matpoints(polls$Date,polls[,leaders],pch=20,col=cols)
legend("topleft",legend=leaders,col=cols,pch=20,bg="white",lwd=2)
axis(1,at=as.double(ticks),labels=format(ticks,format="%b\n%Y"),cex.axis=0.8)
#Unlabelled tick marks on the right-hand axis
right.ticks <- axTicks(4)
axis(4,at=right.ticks,labels=rep("",length(right.ticks)))
#Add best estimates: the last smoothed value of each leader with 1.96
#standard errors, written to the right of the last poll
for(k in seq_along(smoothed)) {
  last.fit <- rev(smoothed[[k]]$fit)[1]
  last.se <- rev(smoothed[[k]]$se.fit)[1]
  lbl <- sprintf("%4.0f%% ± %2.0f",abs(round(last.fit,0)),round(1.96*last.se,0))
  text(rev(polls$Date)[1],last.fit,labels=lbl,pos=4,col=cols[k])
}
dev.off()
#As a cross-check, print every row of the combined table that has at least
#one missing value
has.missing <- !complete.cases(surveys)
checks <- surveys[has.missing,,drop=FALSE]
print(checks)
#Now, let's look at the poll residuals
#First, restack everything into a single long table with one row per poll
#and leader: the polled value ("data") next to the smoothed value ("fit").
#The pieces are built first and bound once, instead of growing the table
#inside a loop.
resid.dat <- do.call(rbind,lapply(leaders,function(lead) {
  data.frame(polls[,c("Company","Date")],
             #Explicit levels keep the leaders in the order of `leaders` (and
             #hence of `cols`) instead of depending on how the running R
             #version converts strings
             leader=factor(lead,levels=leaders),
             data=polls[,lead],
             fit=smoothed[[lead]]$fit)
}))
#Calculate residuals, relative to the smoothed value
resid.dat$bias <- (resid.dat$data-resid.dat$fit)/resid.dat$fit
#Prepare for plotting: shorten the company names so they fit on the axis
plot.dat <- resid.dat
replacements <- data.frame(old=c("TV3-TNS","One News-Colmar Brunton","Herald-DigiPoll","Roy Morgan Research",
                                 "UMR Research","Fairfax Media Poll","Fairfax Media-Nielsen"),
                           new=c("TNS","Colmar\nBrunton","DigiPoll","Roy\nMorgan",
                                 "UMR","Nielsen","Nielsen"),
                           stringsAsFactors=FALSE)
plot.dat$Company <- as.character(plot.dat$Company)
for(i in seq_len(nrow(replacements))) {
  plot.dat$Company <- gsub(replacements$old[i],replacements$new[i],plot.dat$Company,fixed=TRUE)
}
plot.dat$Company <- factor(plot.dat$Company)
#Exclude zeros or "near zeros". A logical mask is used rather than
#-which(...): when no row matches, -which() is an empty index and would
#throw away every row. Rows where the test is NA are kept, as before.
excluded <- plot.dat$data==0 | plot.dat$fit<0.75
plot.dat <- plot.dat[!(excluded %in% TRUE),]
#Plot figure: one box-and-whisker panel per leader showing the relative bias
#(in percent) of each polling company
library(lattice)
p<-bwplot(bias*100~Company|leader,data=plot.dat,as.table=TRUE,
          scales=list(alternating=c(1),relation="free"),
          xlab="Company",
          ylab="Relative Bias (%)",
          panel=function(...) {
            #Reference line at zero bias
            panel.abline(h=0)
            panel.args <- list(...)
            by.company <- split(panel.args$y,factor(panel.args$x))
            #Two-sided one-sample t-test of "mean bias is zero" per company.
            #The statistic is mean/(sd/sqrt(n)); the sqrt(n) was previously
            #missing. The result is only needed by the optional fill rule
            #that is commented out below.
            sig.diff <- vapply(by.company,function(v) {
              n <- length(v)
              t.stat <- abs(mean(v))/(sd(v)/sqrt(n))
              2*pt(t.stat,df=n-1,lower.tail=FALSE) < 0.05
            },logical(1))
            #Fill with the colour of this panel's leader, using dark grey
            #in place of black
            panel.col <- cols[panel.number()]
            fill.col <- if(identical(panel.col,"black")) "darkgrey" else panel.col
            # fill.col <- ifelse(sig.diff,fill.col,NA)
            panel.bwplot(...,pch="|",fill=fill.col)},
          par.settings = list(box.rectangle=list(col="black",lty=1),
                              box.umbrella=list(col="black",lty=1),
                              plot.symbol=list(col="black")))
png("NZ Elections 2005-2008 - PPM bias.png",width=600,height=600)
plot(p)
dev.off()
#Biases by company and leader: for every polling company, the mean relative
#bias (in percent) per leader and the p-value of a two-sided one-sample
#t-test of "mean bias is zero".
#The result is a list with one element per company; sapply() is kept so the
#shape of each element stays as it was.
biases <- lapply(split(plot.dat,plot.dat$Company,drop=TRUE),function(company) {
  sapply(split(company,company$leader,drop=TRUE),function(x) {
    n <- length(x$bias)
    #t statistic = mean/(sd/sqrt(n)); the sqrt(n) was previously missing,
    #which made the p-values too large. A single observation gives NA.
    t.stat <- abs(mean(x$bias))/(sd(x$bias)/sqrt(n))
    data.frame(mean=mean(x$bias)*100,p.value=2*pt(t.stat,df=n-1,lower.tail=FALSE))
  })
})
Licensing[edit]
I, the copyright holder of this work, hereby publish it under the following license:
This file is licensed under the Creative Commons Attribution 3.0 Unported license.
- You are free:
- to share – to copy, distribute and transmit the work
- to remix – to adapt the work
- Under the following conditions:
- attribution – You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
File history
Click on a date/time to view the file as it appeared at that time.
Date/Time | Thumbnail | Dimensions | User | Comment | |
---|---|---|---|---|---|
current | 12:29, 13 October 2008 | 600 × 600 (8 KB) | Trevva (talk | contribs) | {{Information |Description={{en|1=Biases in the preferred prime minister polls. Bias is defined as the (p-s)/s, where p is the value estimated by the individual poll, and s is the "mean" value estimated using the Loess smoother taking into account all pol |
You cannot overwrite this file.
File usage on Commons
There are no pages that use this file.