diff --git a/code_11rounds.R b/code_11rounds.R new file mode 100644 index 0000000000000000000000000000000000000000..df1ef111a949f30f0bde64f1cf2c3221076e75a9 --- /dev/null +++ b/code_11rounds.R @@ -0,0 +1,352 @@ +# load packages +library(dplyr) # for data wrangling +# library(essurvey) # to download ESS data +# if necessary, install with this command: +# devtools::install_github("ropensci/essurvey") +library(sjlabelled) # to convert party vote choice into names +library(data.table) # for the "fread" function to quickly load large csv files + + +# we first save the 11 rounds from the ESS data website +ess_raw <- fread("ESS1e06_7-ESS2e03_6-ESS3e03_7-ESS4e04_6-ESS5e03_5-ESS6e02_6-ESS7e02_3-ESS8e02_3-ESS9e03_2-ESS10-ESS10SC-ESS11-subset.csv", header = TRUE, sep = ",") + +# Now we need to create a function to: +# (i) select required variables from each of the 9 datasets +# (ii) create a generalized party vote choice variable, instead of having lots of country-round specific variables + +# note: for Germany there are TWO vote intention variables +# since they cast 1 vote for a candidate "prtvde1" and then 1 vote for a party list "prtvde2" +# I will just use the party of the candidate vote +# which is why I drop variables ending in "de2" in the function below + +# You can add the variables you want to extract in the select function below +# Make sure to get the variable name exactly right: http://nesstar.ess.nsd.uib.no/webview/ +# Use "start_with()" / "ends_with()" to grab all variables starting with that string +# es.df.clean <- function(x){ +# esx <- x %>% select("essround", # REQUIRED: essround +# "idno", # REQUIRED: respondent ID +# "cntry", # REQUIRED: country +# starts_with("inw"), # REQUIRED: interview date (to match vote recall to specific election) +# "gndr" , # gender +# "agea", # age +# starts_with("edulvl"), # educational attainment (several vars) +# starts_with("isco"), # occupation +# starts_with("prtv"), # party vote +# -ends_with("de1"), # drop 1st German vote intention var +# ) %>% +# as.data.frame() +# # find FIRST country-specific vote variable +# start <- head(grep("prtv", colnames(esx)), n=1) +# # find LAST country-specific vote variable +# end <- tail(grep("prtv", colnames(esx)), n=1) +# # mini dataset of party choice vars +# es.vote <- esx %>% select(start:end) +# # create dataset-wide vote variable by merging the country-specific vars +# esx$party.vote.num <- as.vector(do.call(coalesce, es.vote)) +# # convert numeric values into party names +# es.vote.named <- as_label(es.vote) +# # convert factors into characters to make sure they're stored properly +# es.vote.named[] <- lapply(es.vote.named, as.character) +# # create another dataset-wide vote variable, this time for the character variable +# esx$party.vote.name <- as.vector(do.call(coalesce, es.vote.named)) +# # convert to UTF encoding to deal with special characters +# # delete unnecessary variables +# start <- head(grep("prtvt", colnames(esx)), n=1) +# end <- tail(grep("prtvt", colnames(esx)), n=1) +# esx <- esx %>% select(-(start:end)) +# esx +# } + +es.df.clean <- function(x) { + # Convert to data.table + # x <- as.data.table(x) + + # Select required variables + fixed_columns <- c("essround", "idno", "cntry", "gndr", "agea") + columns_starting_inw <- grep("^inw", names(x), value = TRUE) + columns_starting_edulvl <- grep("^edulvl", names(x), value = TRUE) + columns_starting_isco <- grep("^isco", names(x), value = TRUE) + columns_starting_prtv <- grep("^prtv", names(x), value = TRUE) + selected_columns <- c(fixed_columns, columns_starting_inw, columns_starting_edulvl, columns_starting_isco, columns_starting_prtv) + # apply the selections + esx <- x[, .SD, .SDcols = selected_columns] + + # Drop variables ending with "de1" + columns_to_delete <- grep("de1$", names(esx), value = TRUE) + esx[, (columns_to_delete) := NULL] + + # Drop columns prtvtro and prtvtait that only are missing values + # esx[,.N, by=prtvtro] + # esx[,.N, by=prtvtait] + esx[, c("prtvtro", "prtvtait") := NULL] + + # Find FIRST and LAST country-specific vote variable + vote_cols <- grep("^prtv", names(esx), value = TRUE) + + # Create dataset-wide vote variable by merging the country-specific vars + esx[, party.vote.num := do.call(fcoalesce, .SD), .SDcols = vote_cols] + + # Convert numeric values into party names + # es.vote.named <- as.data.table(lapply(esx[, ..vote_cols], as_label)) + es.vote.named <- esx[, lapply(.SD, as.character)] + + # Create another dataset-wide vote variable, this time for the character variable + esx[, party.vote.name := do.call(fcoalesce, es.vote.named)] + + # Drop unnecessary variables + unnecessary_columns <- grep("^prtv", names(esx), value = TRUE) + # apply the selections + esx <- esx[, .SD, .SDcols = -unnecessary_columns] + + return(esx) +} + + +# Clean the dataset with 11 rounds +ess_clean <- es.df.clean(ess_raw) + +# Save the cleaned data to be able to reuse it later without the preprocessing +write.csv(ess_clean, file = "ess_clean.csv", row.names = FALSE) + +# Clean R memory +rm(list = ls()) +gc() + +# useful function +tabl <- function(...) table(..., useNA='ifany') + +# open the cleaned data +ess <- read.csv("ess_clean.csv") + +# EDUCATION: +# Let's create a dummy variable indicating that the respondent +# has attained a bachelor's degree or above +# ESS rounds 1-4 use the "edulvla" variable +xtabs(~ essround + edulvla, data=ess) +# ESS rounds 5 onwards use a more detailed "edulvlb" variable +xtabs(~ essround + edulvlb, data=ess) + +# First let's code "other" as missing +ess$edulvla[ess$edulvla==55] <- NA # "other" +ess$edulvlb[ess$edulvlb==5555] <- NA # "other" + +# now create dummy for bachelors degree +# for more details on the categories: https://www.europeansocialsurvey.org/docs/round8/survey/ESS8_data_protocol_e01_4.pdf +ess$educ.ba <- ifelse(ess$essround<5 & ess$edulvla==5, 1, + ifelse(ess$essround>=5 & ess$edulvlb>600, 1, 0)) +tabl(ess$educ.ba) + + +# OCCUPATION +head(xtabs(~ iscoco + essround, data=ess)) +head(xtabs(~ isco08 + essround, data=ess)) + +# load Oesch occupation-class crosswalks from the Github repo +# Alternatively, you can run the script "oesch_class_crosswalks.R" to produce them yourself + +# crosswalk for ISCO 1988 codes +cw88 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_88_4dig_cleaned.csv") +cw88 <- cw88[,-1] +# crosswalk for ISCO 2008 codes +names(cw88) <- c("isco88", "isco88_desc", "oesch_class88") +cw08 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_08_4dig_cleaned.csv") +cw08 <- cw08[,-1] +names(cw08) <- c("isco08", "isco_desc08", "oesch_class08") + + +ess <- left_join(ess, cw88[!is.na(cw88$isco88),], by=c("iscoco"="isco88")) +ess <- left_join(ess, cw08, by=c("isco08"="isco08")) +ess <- ess|> + mutate(oesch_class = coalesce(oesch_class88, oesch_class08)) +tabl(ess$oesch_class) + +# NOTE: since I am constructing the class mapping just based on occupation +# categories 1-4 (the self-employed) of the Oesch class schema will not be included here +# if you want to include these categories, follow Oesch's mapping using the additional variables +# "emplrel" and "emplno" +ess <- ess|> + mutate(oesch_class_sum = case_when( + oesch_class %in% c(1,2) ~ "Self-employed professionals", + oesch_class %in% c(3,4) ~ "Small business owners", + oesch_class %in% c(5,6) ~ "Technical (semi-)professionals", + oesch_class %in% c(7,8) ~ "Production workers", + oesch_class %in% c(9,10) ~ "(Associate) managers", + oesch_class %in% c(11,12) ~ "Clerks", + oesch_class %in% c(13,14) ~ "Sociocultural (semi-)professionals", + oesch_class %in% c(15,16) ~ "Service workers")) +tabl(ess$oesch_class_sum) + +# gender +tabl(ess$gndr) +ess$female <- ifelse(ess$gndr==1, 0, + ifelse(ess$gndr==2, 1, NA)) +tabl(ess$female) + +# age +table(ess$agea) +ess$age <- ess$agea +ess$age[ess$agea==999] <- NA +table(ess$age) +ess$age.group <- cut(ess$age, breaks=c(0,20,35,50,65,75, 120)) +table(ess$age.group) + +# year +ess$essround.year <- NA +ess$essround.year[ess$essround==1] <- 2002 +ess$essround.year[ess$essround==2] <- 2004 +ess$essround.year[ess$essround==3] <- 2006 +ess$essround.year[ess$essround==4] <- 2008 +ess$essround.year[ess$essround==5] <- 2010 +ess$essround.year[ess$essround==6] <- 2012 +ess$essround.year[ess$essround==7] <- 2014 +ess$essround.year[ess$essround==8] <- 2016 +ess$essround.year[ess$essround==9] <- 2018 +ess$essround.year[ess$essround==10] <- 2020 +ess$essround.year[ess$essround==11] <- 2022 + +# party.vote.ess +table(ess$cntry) +ess$party.vote.ess <- ifelse(is.na(ess$party.vote.num), NA, + paste0(ess$cntry, "-", ess$essround, "-", ess$party.vote.num)) +tabl(ess$party.vote.ess) + +# load the ESS-Partyfacts extended crosswalk +cw_ess_pf <- fread("https://raw.githubusercontent.com/sophieehill/ess-partyfacts-crosswalk/master/ess-partyfacts-extended.csv") +cw_ess_pf$party.vote.ess <- paste0(cw_ess_pf$cntry, "-", cw_ess_pf$essround, "-", cw_ess_pf$ess_id) +cw_ess_pf <- cw_ess_pf |> + select(party.vote.ess, partyfacts_id, partyfacts_name) + +# merge partyfacts IDs into main dataset +ess <- left_join(ess, cw_ess_pf, by="party.vote.ess") +tabl(ess$party.vote.ess) +tabl(ess$partyfacts_id) # bcp NA + +# now load the Partyfacts-External crosswalk and select the Manifesto dataset +# this lets us link those partyfacts IDs to *other* datasets +cw_pf <- fread("https://partyfacts.herokuapp.com/download/external-parties-csv", fill = TRUE) +cw_pf_cmp <- cw_pf |> + filter(dataset_key == "manifesto") |> + select(partyfacts_id, dataset_party_id) +cw_pf_cmp$dataset_party_id <- as.numeric(cw_pf_cmp$dataset_party_id) + +names(cw_pf_cmp) <- c("partyfacts_id", "cmp_id") + +ess <- left_join(ess, cw_pf_cmp, by="partyfacts_id") +tabl(ess$cmp_id) + +# In order to merge in election-level variables (e.g. measures of a party's manifesto for a particular election), we need to match up the ESS dates to the most recent election +# Some ESS fieldwork occurs over an election period, meaning that respondents within the same country-round would be referring to different elections when they recall their "past vote" +# First, let's import the dataset from Denis Cohen's github: https://github.com/denis-cohen/ess-election-dates +ess_dates <- fread("https://raw.githubusercontent.com/denis-cohen/ess-election-dates/master/ess_election_dates.csv") +# select needed vars +ess_dates <- ess_dates |> + select(cntry, essround, recent_election, recent_election_split1) +# merge in +ess <- left_join(ess, ess_dates, by=c("cntry", "essround")) + +# create a variable indicating date of interview for each respondent +# first create day/month/year variables consistent across rounds +# from ESS Round 3 onwards, they give us the start (inwdds) AND end date (inwdde) of the interview +# here I am taking the start date as our reference point +# I *think* the politics module occurs fairly early during the survey +# Alternatively we coulld take the midpoint, or use the end date? +ess <- ess |> + mutate(int.day = case_when(essround<3 ~ inwdd, + essround>2 ~ inwdds)) |> + mutate(int.month = case_when(essround<3 ~ inwmm, + essround>2 ~ inwmms)) |> + mutate(int.year = case_when(essround<3 ~ inwyr, + essround>2 ~ inwyys)) +ess <- ess |> + mutate(int.date = as.Date(paste(int.year, int.month, int.day, sep="-"))) +tabl(ess$int.date) +# for each respondent, let's define their "most recent election", based on start interview date +ess <- ess |> + mutate(ref.election = case_when(int.date > recent_election ~ recent_election, + int.date <= recent_election ~ recent_election_split1)) +tabl(ess$ref.election) +# if the specific date is missing let's just match up using the country-year pair + + +# Merge with CMP data to get party families +# Download latest CMP dataset +# (Use API or just load "cmp.csv") +library(manifestoR) +# set API key +mp_setapikey(key = "70af9d9d7f76a3d66d41142debe969f6") +# download latest dataset +cmp <- as.data.frame(mp_maindataset()) +# save for replicability +# write.csv(cmp, "cmp_main_2020.csv") +head(cmp) +tabl(cmp$edate) +summary(cmp$party) +# create election year variable +cmp$election.year <- as.numeric(substr(cmp$date, 1, 4)) +# create econ l-r and lib-auth scales, following Bakker & Hobolt (2013) +cmp <- cmp |> + mutate(econlr = scale_logit(data=cmp, + pos=c("per401", "per402", "per407", "per505", + "per507", "per410", "per414", "per702"), + neg=c("per403", "per404", "per406", "per504", + "per506", "per413", "per412", "per701", + "per405", "per409", "per415", "per503"), + zero_offset = 0.5)) + +cmp <- cmp |> + mutate(econlr.sal = (per401 + per402 + per407 + per505 + per507 + per410 + per414 + per702) + + (per403 + per404 + per406 + per504 + per506 + per413 + per412 + per701 + per405 + per409 + per415 + per503)) + + +summary(cmp$econlr.sal) + +cmp <- cmp |> + mutate(auth = scale_logit(data=cmp, + pos=c("per305", "per601", "per603", "per605", + "per608", "per606"), + neg=c("per501", "per602", "per604", "per502", + "per607", "per416", "per705", "per706", + "per201", "per202"), + zero_offset = 0.5)) + +cmp <- cmp |> + mutate(auth.sal = (per305 + per601 + per603 + per605 + per608 + per606) + + (per501 + per602 + per604 + per502 + per607 + per416 + per705 + per706 + per201 + per202)) +# select party code, party family +# as well as party-election specific variables like right/left coding of the manifesto +cmp.x <- cmp |> + select(party, parfam, election.year, edate, rile, econlr, econlr.sal, auth, auth.sal) +names(cmp.x)[1:2] <- c("cmp_id", "cmp_parfam") # relabel for clarity +head(cmp.x) +ess$election.year <- as.numeric(substr(ess$ref.election, 1, 4)) +tabl(ess$election.year) +# match up by election year +# N.B. this won't work for cases where two elections happen in the same year, and ESS fieldwork window covers the 2nd election +ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.year")) +# alternatively we could match on exact election date +# cmp.x$election.date <- as.Date(cmp.x$edate) +# ess$election.date <- as.Date(ess$ref.election) +# ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.date")) + +# create left vote recall based on party families +# 10 = ecological +# 20 = socialist or other left +# 30 = social democratic +ess$vote.left <- ifelse(ess$cmp_parfam==10 | ess$cmp_parfam==20 | ess$cmp_parfam==30, 1, 0) +tabl(ess$vote.left) + +names(ess) + +head(ess) +essx <- ess |> + select(idno, cntry, essround, essround.year, int.date, + female, age, age.group, educ.ba, + oesch_class, oesch_class_sum, + # domicil, nace.summary, lrscale, + party.vote.ess, partyfacts_id, partyfacts_name, + cmp_id, cmp_parfam, vote.left, ref.election, + election.year, edate, rile, + econlr, econlr.sal, auth, auth.sal) |> + as.data.frame() + +write.csv(essx, "ess_cumulative_core.csv")