Skip to content
Extraits de code Groupes Projets
Valider c5e88e3b rédigé par Alain Guillet's avatar Alain Guillet
Parcourir les fichiers

Code for 11 rounds

parent b34d9120
Aucune branche associée trouvée
Aucune étiquette associée trouvée
Aucune requête de fusion associée trouvée
# load packages
library(dplyr) # for data wrangling
# library(essurvey) # to download ESS data
# if necessary, install with this command:
# devtools::install_github("ropensci/essurvey")
library(sjlabelled) # to convert party vote choice into names
library(data.table) # for the "fread" function to quickly load large csv files
# Read the combined multi-round extract that was saved beforehand from the
# ESS data website (comma-separated, first row holds the column names)
ess_raw <- fread(
  input = "ESS1e06_7-ESS2e03_6-ESS3e03_7-ESS4e04_6-ESS5e03_5-ESS6e02_6-ESS7e02_3-ESS8e02_3-ESS9e03_2-ESS10-ESS10SC-ESS11-subset.csv",
  sep = ",",
  header = TRUE
)
# Now we need to create a function to:
# (i) select required variables from the combined file holding all 11 rounds
# (ii) create a generalized party vote choice variable, instead of having lots of country-round specific variables
# note: for Germany there are TWO vote intention variables
# since they cast 1 vote for a candidate "prtvde1" and then 1 vote for a party list "prtvde2"
# Only one of the two is kept: the function below drops the variables ending
# in "de1" (the candidate vote), so the party-list vote ("de2") is the one used
# You can add the variables you want to extract in the select function below
# Make sure to get the variable name exactly right: http://nesstar.ess.nsd.uib.no/webview/
# Use "starts_with()" / "ends_with()" to grab all variables starting / ending with that string
# es.df.clean <- function(x){
# esx <- x %>% select("essround", # REQUIRED: essround
# "idno", # REQUIRED: respondent ID
# "cntry", # REQUIRED: country
# starts_with("inw"), # REQUIRED: interview date (to match vote recall to specific election)
# "gndr" , # gender
# "agea", # age
# starts_with("edulvl"), # educational attainment (several vars)
# starts_with("isco"), # occupation
# starts_with("prtv"), # party vote
# -ends_with("de1"), # drop 1st German vote intention var
# ) %>%
# as.data.frame()
# # find FIRST country-specific vote variable
# start <- head(grep("prtv", colnames(esx)), n=1)
# # find LAST country-specific vote variable
# end <- tail(grep("prtv", colnames(esx)), n=1)
# # mini dataset of party choice vars
# es.vote <- esx %>% select(start:end)
# # create dataset-wide vote variable by merging the country-specific vars
# esx$party.vote.num <- as.vector(do.call(coalesce, es.vote))
# # convert numeric values into party names
# es.vote.named <- as_label(es.vote)
# # convert factors into characters to make sure they're stored properly
# es.vote.named[] <- lapply(es.vote.named, as.character)
# # create another dataset-wide vote variable, this time for the character variable
# esx$party.vote.name <- as.vector(do.call(coalesce, es.vote.named))
# # convert to UTF encoding to deal with special characters
# # delete unnecessary variables
# start <- head(grep("prtvt", colnames(esx)), n=1)
# end <- tail(grep("prtvt", colnames(esx)), n=1)
# esx <- esx %>% select(-(start:end))
# esx
# }
# Build the analysis subset of the raw ESS extract.
#
# Keeps the identifier / demographic columns plus every interview-date,
# education and occupation column, and merges the many country-specific
# "prtv*" vote columns into two dataset-wide columns.
#
# Args:
#   x: the raw ESS extract. A data.table, or any object as.data.table()
#      accepts. Must contain "essround", "idno", "cntry", "gndr" and "agea".
#
# Returns:
#   A data.table with the selected columns plus
#     party.vote.num  - first non-missing value across the vote columns
#     party.vote.name - the same value stored as character
#   and without any of the "prtv*" columns.
es.df.clean <- function(x) {
  # data.table syntax is used below, so coerce other inputs first
  if (!is.data.table(x)) {
    x <- as.data.table(x)
  }

  # Select required variables (order: fixed columns, then each prefix group)
  fixed_columns <- c("essround", "idno", "cntry", "gndr", "agea")
  selected_columns <- c(
    fixed_columns,
    grep("^inw", names(x), value = TRUE),
    grep("^edulvl", names(x), value = TRUE),
    grep("^isco", names(x), value = TRUE),
    grep("^prtv", names(x), value = TRUE)
  )
  esx <- x[, .SD, .SDcols = selected_columns]

  # Drop variables ending with "de1", plus prtvtro and prtvtait, which only
  # contain missing values. intersect() keeps this silent when one of the two
  # named columns is not part of the extract.
  columns_to_delete <- unique(c(
    grep("de1$", names(esx), value = TRUE),
    intersect(c("prtvtro", "prtvtait"), names(esx))
  ))
  if (length(columns_to_delete) > 0) {
    esx[, (columns_to_delete) := NULL]
  }

  # Remaining country-specific vote variables
  vote_cols <- grep("^prtv", names(esx), value = TRUE)

  # Create dataset-wide vote variable by merging the country-specific vars
  esx[, party.vote.num := do.call(fcoalesce, .SD), .SDcols = vote_cols]

  # Character version of the same variable.
  # .SDcols is required here: without it .SD holds every column, and the
  # coalesce would simply return the first column (essround) as text.
  # es.vote.named <- as.data.table(lapply(esx[, ..vote_cols], as_label))
  es.vote.named <- esx[, lapply(.SD, as.character), .SDcols = vote_cols]
  esx[, party.vote.name := do.call(fcoalesce, es.vote.named)]

  # Drop the country-specific vote columns now that they have been merged
  esx[, .SD, .SDcols = -vote_cols]
}
# Clean the dataset with 11 rounds
ess_clean <- es.df.clean(ess_raw)
# Save the cleaned data to be able to reuse it later without the preprocessing
write.csv(ess_clean, file = "ess_clean.csv", row.names = FALSE)
# Clean R memory: remove only the two large tables created above.
# rm(list = ls()) would also delete every unrelated object in the workspace
# of whoever runs or sources this script.
rm(ess_raw, ess_clean)
gc()
# Frequency table that also reports missing values:
# forwards all arguments to table() with useNA = "ifany",
# so an NA category is shown whenever the data contain NAs
tabl <- function(...) {
  table(..., useNA = "ifany")
}
# open the cleaned data
ess <- read.csv("ess_clean.csv")

# EDUCATION:
# Let's create a dummy variable indicating that the respondent
# has attained a bachelor's degree or above
# ESS rounds 1-4 use the "edulvla" variable
xtabs(~ essround + edulvla, data = ess)
# ESS rounds 5 onwards use a more detailed "edulvlb" variable
xtabs(~ essround + edulvlb, data = ess)

# First let's code "other" (55 / 5555) and the ESS missing-value codes
# (refusal, don't know, no answer: 77/88/99 and 7777/8888/9999) as missing.
# Left in place, the 4-digit codes would pass the "> 600" test below and be
# counted as a degree, and the 2-digit ones would be counted as "no degree".
ess$edulvla[ess$edulvla %in% c(55, 77, 88, 99)] <- NA
ess$edulvlb[ess$edulvlb %in% c(5555, 7777, 8888, 9999)] <- NA

# now create dummy for bachelors degree
# for more details on the categories: https://www.europeansocialsurvey.org/docs/round8/survey/ESS8_data_protocol_e01_4.pdf
# rounds 1-4: edulvla == 5; rounds 5+: edulvlb > 600; NA when the relevant
# education variable is missing
ess$educ.ba <- ifelse(
  ess$essround < 5 & ess$edulvla == 5, 1,
  ifelse(ess$essround >= 5 & ess$edulvlb > 600, 1, 0)
)
tabl(ess$educ.ba)
# OCCUPATION
head(xtabs(~ iscoco + essround, data = ess))
head(xtabs(~ isco08 + essround, data = ess))

# load Oesch occupation-class crosswalks from the Github repo
# Alternatively, you can run the script "oesch_class_crosswalks.R" to produce them yourself

# crosswalk for ISCO 1988 codes
cw88 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_88_4dig_cleaned.csv")
cw88 <- cw88[, -1] # drop the first column of the downloaded file
names(cw88) <- c("isco88", "isco88_desc", "oesch_class88")

# crosswalk for ISCO 2008 codes
cw08 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_08_4dig_cleaned.csv")
cw08 <- cw08[, -1] # drop the first column of the downloaded file
names(cw08) <- c("isco08", "isco_desc08", "oesch_class08")

# Merge both crosswalks on the occupation code.
# Crosswalk rows with a missing code are removed first: dplyr joins match NA
# to NA by default, so such a row would be attached to every respondent whose
# occupation code is missing.
ess <- left_join(ess, cw88[!is.na(cw88$isco88), ], by = c("iscoco" = "isco88"))
ess <- left_join(ess, cw08[!is.na(cw08$isco08), ], by = c("isco08" = "isco08"))

# one class variable: the ISCO-88 based class if present, else the ISCO-08 one
ess <- ess |>
  mutate(oesch_class = coalesce(oesch_class88, oesch_class08))
tabl(ess$oesch_class)

# NOTE: since I am constructing the class mapping just based on occupation,
# categories 1-4 (the self-employed) of the Oesch class schema will not be included here
# if you want to include these categories, follow Oesch's mapping using the additional variables
# "emplrel" and "emplno"
ess <- ess |>
  mutate(oesch_class_sum = case_when(
    oesch_class %in% c(1, 2) ~ "Self-employed professionals",
    oesch_class %in% c(3, 4) ~ "Small business owners",
    oesch_class %in% c(5, 6) ~ "Technical (semi-)professionals",
    oesch_class %in% c(7, 8) ~ "Production workers",
    oesch_class %in% c(9, 10) ~ "(Associate) managers",
    oesch_class %in% c(11, 12) ~ "Clerks",
    oesch_class %in% c(13, 14) ~ "Sociocultural (semi-)professionals",
    oesch_class %in% c(15, 16) ~ "Service workers"
  ))
tabl(ess$oesch_class_sum)
# gender: gndr 1 -> female = 0, gndr 2 -> female = 1, anything else -> NA
tabl(ess$gndr)
ess$female <- c(0, 1)[match(ess$gndr, c(1, 2))]
tabl(ess$female)

# age: copy agea, treating the value 999 as missing, then bin into groups
table(ess$agea)
ess$age <- replace(ess$agea, ess$agea == 999, NA)
table(ess$age)
ess$age.group <- cut(ess$age, breaks = c(0, 20, 35, 50, 65, 75, 120))
table(ess$age.group)

# year: rounds 1..11 map to 2002, 2004, ..., 2022; any other round stays NA
ess$essround.year <- seq(2002, 2022, by = 2)[match(ess$essround, 1:11)]
# party.vote.ess
# Build a "<country>-<round>-<party code>" key for every respondent with a
# recorded vote; respondents without one keep NA
table(ess$cntry)
ess$party.vote.ess <- ifelse(
  is.na(ess$party.vote.num), NA,
  paste0(ess$cntry, "-", ess$essround, "-", ess$party.vote.num)
)
tabl(ess$party.vote.ess)

# load the ESS-Partyfacts extended crosswalk
cw_ess_pf <- fread("https://raw.githubusercontent.com/sophieehill/ess-partyfacts-crosswalk/master/ess-partyfacts-extended.csv")
# same key format as in the main dataset
cw_ess_pf$party.vote.ess <- paste0(cw_ess_pf$cntry, "-", cw_ess_pf$essround, "-", cw_ess_pf$ess_id)
cw_ess_pf <- cw_ess_pf |>
  select(party.vote.ess, partyfacts_id, partyfacts_name)

# merge partyfacts IDs into main dataset
ess <- left_join(ess, cw_ess_pf, by = "party.vote.ess")
tabl(ess$party.vote.ess)
tabl(ess$partyfacts_id) # many NAs

# now load the Partyfacts-External crosswalk and select the Manifesto dataset
# this lets us link those partyfacts IDs to *other* datasets
cw_pf <- fread("https://partyfacts.herokuapp.com/download/external-parties-csv", fill = TRUE)
# Keep only Manifesto rows that carry a partyfacts ID. dplyr joins match NA to
# NA by default, and most respondents have a missing partyfacts_id, so a
# crosswalk row without an ID would be attached to all of them.
cw_pf_cmp <- cw_pf |>
  filter(dataset_key == "manifesto", !is.na(partyfacts_id)) |>
  select(partyfacts_id, dataset_party_id)
cw_pf_cmp$dataset_party_id <- as.numeric(cw_pf_cmp$dataset_party_id)
names(cw_pf_cmp) <- c("partyfacts_id", "cmp_id")
ess <- left_join(ess, cw_pf_cmp, by = "partyfacts_id")
tabl(ess$cmp_id)
# In order to merge in election-level variables (e.g. measures of a party's manifesto for a particular election), we need to match up the ESS dates to the most recent election
# Some ESS fieldwork occurs over an election period, meaning that respondents within the same country-round would be referring to different elections when they recall their "past vote"
# First, let's import the dataset from Denis Cohen's github: https://github.com/denis-cohen/ess-election-dates
ess_dates <- fread("https://raw.githubusercontent.com/denis-cohen/ess-election-dates/master/ess_election_dates.csv")
# select needed vars
ess_dates <- ess_dates |>
  select(cntry, essround, recent_election, recent_election_split1)
# merge in
ess <- left_join(ess, ess_dates, by = c("cntry", "essround"))

# create a variable indicating date of interview for each respondent
# first create day/month/year variables consistent across rounds
# from ESS Round 3 onwards, they give us the start (inwdds) AND end date (inwdde) of the interview
# here I am taking the start date as our reference point
# I *think* the politics module occurs fairly early during the survey
# Alternatively we could take the midpoint, or use the end date?
ess <- ess |>
  mutate(
    int.day = case_when(essround < 3 ~ inwdd,
                        essround > 2 ~ inwdds),
    int.month = case_when(essround < 3 ~ inwmm,
                          essround > 2 ~ inwmms),
    int.year = case_when(essround < 3 ~ inwyr,
                         essround > 2 ~ inwyys)
  )

# An explicit format is required: without it as.Date() inspects the first
# string only and stops with an error if that one is not a valid date (e.g.
# "NA-NA-NA" from a missing interview date). With a format, unparsable dates
# simply become NA.
ess <- ess |>
  mutate(int.date = as.Date(paste(int.year, int.month, int.day, sep = "-"),
                            format = "%Y-%m-%d"))
tabl(ess$int.date)

# for each respondent, let's define their "most recent election", based on start interview date:
# interviewed after recent_election -> that election,
# interviewed on or before it -> recent_election_split1
ess <- ess |>
  mutate(ref.election = case_when(int.date > recent_election ~ recent_election,
                                  int.date <= recent_election ~ recent_election_split1))
tabl(ess$ref.election)
# if the specific date is missing let's just match up using the country-year pair
# Merge with CMP data to get party families
# Download latest CMP dataset
# (Use API or just load "cmp.csv")
library(manifestoR)

# set API key
# SECURITY NOTE(review): the key below is committed in plain text. Prefer
# supplying your own key through the MANIFESTO_API_KEY environment variable;
# the committed key is kept only as a fallback so the script still runs as-is.
mp_setapikey(key = Sys.getenv("MANIFESTO_API_KEY",
                              unset = "70af9d9d7f76a3d66d41142debe969f6"))

# download latest dataset
cmp <- as.data.frame(mp_maindataset())
# save for replicability
# write.csv(cmp, "cmp_main_2020.csv")
head(cmp)
tabl(cmp$edate)
summary(cmp$party)

# create election year variable (first four characters of "date")
cmp$election.year <- as.numeric(substr(cmp$date, 1, 4))

# create econ l-r and lib-auth scales, following Bakker & Hobolt (2013)
# econlr / auth: logit scales built from the "pos" and "neg" categories
# econlr.sal / auth.sal: plain sum of all categories entering the matching scale
cmp <- cmp |>
  mutate(
    econlr = scale_logit(data = cmp,
                         pos = c("per401", "per402", "per407", "per505",
                                 "per507", "per410", "per414", "per702"),
                         neg = c("per403", "per404", "per406", "per504",
                                 "per506", "per413", "per412", "per701",
                                 "per405", "per409", "per415", "per503"),
                         zero_offset = 0.5),
    econlr.sal = (per401 + per402 + per407 + per505 + per507 + per410 + per414 + per702) +
      (per403 + per404 + per406 + per504 + per506 + per413 + per412 + per701 + per405 + per409 + per415 + per503),
    auth = scale_logit(data = cmp,
                       pos = c("per305", "per601", "per603", "per605",
                               "per608", "per606"),
                       neg = c("per501", "per602", "per604", "per502",
                               "per607", "per416", "per705", "per706",
                               "per201", "per202"),
                       zero_offset = 0.5),
    auth.sal = (per305 + per601 + per603 + per605 + per608 + per606) +
      (per501 + per602 + per604 + per502 + per607 + per416 + per705 + per706 + per201 + per202)
  )
summary(cmp$econlr.sal)

# select party code, party family
# as well as party-election specific variables like right/left coding of the manifesto
# (party and parfam are relabelled cmp_id / cmp_parfam for clarity)
cmp.x <- cmp |>
  select(cmp_id = party, cmp_parfam = parfam,
         election.year, edate, rile, econlr, econlr.sal, auth, auth.sal)
head(cmp.x)
# year of each respondent's reference election (first four characters)
ess <- ess |>
  mutate(election.year = as.numeric(substr(ref.election, 1, 4)))
tabl(ess$election.year)

# match up by election year
# N.B. this won't work for cases where two elections happen in the same year, and ESS fieldwork window covers the 2nd election
ess <- ess |>
  left_join(cmp.x, by = c("cmp_id", "election.year"))
# alternatively we could match on exact election date
# cmp.x$election.date <- as.Date(cmp.x$edate)
# ess$election.date <- as.Date(ess$ref.election)
# ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.date"))

# create left vote recall based on party families
# 10 = ecological
# 20 = socialist or other left
# 30 = social democratic
ess <- ess |>
  mutate(vote.left = ifelse(cmp_parfam == 10 | cmp_parfam == 20 | cmp_parfam == 30, 1, 0))
tabl(ess$vote.left)

names(ess)
head(ess)

# keep the core variables and write them out
essx <- as.data.frame(
  select(
    ess,
    idno, cntry, essround, essround.year, int.date,
    female, age, age.group, educ.ba,
    oesch_class, oesch_class_sum,
    # domicil, nace.summary, lrscale,
    party.vote.ess, partyfacts_id, partyfacts_name,
    cmp_id, cmp_parfam, vote.left, ref.election,
    election.year, edate, rile,
    econlr, econlr.sal, auth, auth.sal
  )
)
write.csv(essx, file = "ess_cumulative_core.csv")
0% Chargement en cours ou .
You are about to add 0 people to the discussion. Proceed with caution.
Terminez d'abord l'édition de ce message.
Veuillez vous inscrire ou vous pour commenter