# code_11rounds.R

# load packages
library(dplyr) # for data wrangling
# library(essurvey) # to download ESS data
# if necessary, install with this command:
# devtools::install_github("ropensci/essurvey")
library(sjlabelled) # to convert party vote choice into names
library(data.table) # for the "fread" function to quickly load large csv files


# Read the combined ESS extract (rounds 1-11) from the working directory.
# The CSV is expected to have been saved beforehand from the ESS data website.
ess_raw <- fread(
  "ESS1e06_7-ESS2e03_6-ESS3e03_7-ESS4e04_6-ESS5e03_5-ESS6e02_6-ESS7e02_3-ESS8e02_3-ESS9e03_2-ESS10-ESS10SC-ESS11-subset.csv",
  sep = ",",
  header = TRUE
)

# Now we need to create a function to:
# (i) select the required variables from the combined dataset (all 11 rounds)
# (ii) create a generalized party vote choice variable, instead of having lots of country-round specific variables

# note: for Germany there are TWO vote intention variables
# since they cast 1 vote for a candidate "prtvde1" and then 1 vote for a party list "prtvde2"
# The function below drops the variables ending in "de1" (the candidate vote),
# so the party-list vote (variables ending in "de2") is the one that is kept

# You can add the variables you want to extract in the selection step of the function below
# Make sure to get the variable name exactly right: http://nesstar.ess.nsd.uib.no/webview/
# Use "starts_with()" / "ends_with()" (or an anchored grep pattern) to grab all variables starting / ending with that string
# es.df.clean <- function(x){
#   esx <- x %>% select("essround", # REQUIRED: essround
#                       "idno", # REQUIRED: respondent ID
#                       "cntry", # REQUIRED: country 
#                       starts_with("inw"), # REQUIRED: interview date (to match vote recall to specific election)
#                       "gndr" , # gender
#                       "agea", # age
#                       starts_with("edulvl"), # educational attainment (several vars)
#                       starts_with("isco"), # occupation
#                       starts_with("prtv"), # party vote
#                       -ends_with("de1"), # drop 1st German vote intention var
#   ) %>% 
#     as.data.frame()
#   # find FIRST country-specific vote variable
#   start <- head(grep("prtv", colnames(esx)), n=1)
#   # find LAST country-specific vote variable
#   end <- tail(grep("prtv", colnames(esx)), n=1)
#   # mini dataset of party choice vars
#   es.vote <- esx %>% select(start:end)
#   # create dataset-wide vote variable by merging the country-specific vars
#   esx$party.vote.num <- as.vector(do.call(coalesce, es.vote))
#   # convert numeric values into party names
#   es.vote.named <- as_label(es.vote)
#   # convert factors into characters to make sure they're stored properly
#   es.vote.named[] <- lapply(es.vote.named, as.character)
#   # create another dataset-wide vote variable, this time for the character variable
#   esx$party.vote.name <- as.vector(do.call(coalesce, es.vote.named))
#   # convert to UTF encoding to deal with special characters
#   # delete unnecessary variables
#   start <- head(grep("prtvt", colnames(esx)), n=1)
#   end <- tail(grep("prtvt", colnames(esx)), n=1)
#   esx <- esx %>% select(-(start:end))
#   esx
# }

# Reduce a raw ESS table to the core variables and collapse the many
# country-specific party-vote columns ("prtv*") into two dataset-wide columns.
#
# Args:
#   x: a data.table (or anything as.data.table() accepts) that contains at
#      least the columns essround, idno, cntry, gndr and agea, plus any number
#      of columns starting with "inw", "edulvl", "isco" and "prtv".
#
# Returns:
#   A data.table with the fixed columns, the "inw*", "edulvl*" and "isco*"
#   columns, and two new columns:
#     party.vote.num  - first non-missing value across the "prtv*" columns
#     party.vote.name - the same value converted to character
#   All "prtv*" columns themselves are removed from the result.
es.df.clean <- function(x) {
  # The `:=` and .SD idioms below only work on a data.table, so coerce first;
  # this lets callers pass a plain data.frame as well.
  x <- as.data.table(x)

  # Select required variables, keeping the order: fixed columns first, then
  # each prefix group in turn.
  fixed_columns <- c("essround", "idno", "cntry", "gndr", "agea")
  prefix_patterns <- c("^inw", "^edulvl", "^isco", "^prtv")
  prefixed_columns <- unlist(lapply(
    prefix_patterns,
    function(pattern) grep(pattern, names(x), value = TRUE)
  ))
  selected_columns <- c(fixed_columns, prefixed_columns)
  esx <- x[, .SD, .SDcols = selected_columns]

  # Drop variables ending with "de1" (first German vote variable) and the two
  # vote columns that only contain missing values. Columns that are not
  # present are skipped rather than passed to `:= NULL`.
  columns_to_delete <- union(
    grep("de1$", names(esx), value = TRUE),
    intersect(c("prtvtro", "prtvtait"), names(esx))
  )
  if (length(columns_to_delete) > 0L) {
    esx[, (columns_to_delete) := NULL]
  }

  # Remaining country-specific vote variables
  vote_cols <- grep("^prtv", names(esx), value = TRUE)

  # Dataset-wide vote variable: first non-missing country-specific value
  esx[, party.vote.num := do.call(fcoalesce, .SD), .SDcols = vote_cols]

  # Character version of the vote columns. .SDcols is required here: without
  # it every column (starting with essround) is converted and coalesced, so
  # party.vote.name would hold the round number instead of the vote code.
  # Note: no value labels are applied, so these are the codes as strings.
  es.vote.named <- esx[, lapply(.SD, as.character), .SDcols = vote_cols]
  esx[, party.vote.name := do.call(fcoalesce, es.vote.named)]

  # Drop the country-specific vote variables, keeping the two merged ones
  columns_to_keep <- setdiff(names(esx), vote_cols)
  esx <- esx[, .SD, .SDcols = columns_to_keep]

  esx
}


# Clean the dataset with 11 rounds
ess_clean <- es.df.clean(ess_raw)

# Save the cleaned data to be able to reuse it later without the preprocessing
write.csv(ess_clean, file = "ess_clean.csv", row.names = FALSE)

# Free the two large tables before continuing. Only these objects are removed:
# rm(list = ls()) would also wipe anything else in the user's workspace.
rm(ess_raw, ess_clean)
gc()

# Frequency table helper: same as table(), but missing values get their own
# cell whenever at least one is present.
tabl <- function(...) {
  table(..., useNA = "ifany")
}

# open the cleaned data
ess <- read.csv("ess_clean.csv")

# EDUCATION:
# Let's create a dummy variable indicating that the respondent
# has attained a bachelor's degree or above
# ESS rounds 1-4 use the "edulvla" variable
xtabs(~ essround + edulvla, data = ess)
# ESS rounds 5 onwards use a more detailed "edulvlb" variable
xtabs(~ essround + edulvlb, data = ess)

# First recode the non-substantive answers as missing.
# Besides "other" (55 / 5555), the ESS codebook lists refusal, don't know and
# no answer as 77/88/99 (edulvla) and 7777/8888/9999 (edulvlb). If they are
# left in as numbers, the edulvlb codes pass the "> 600" test below and are
# counted as degree holders. NOTE(review): confirm the codes against the
# codebook of the extract in use.
ess$edulvla[ess$edulvla %in% c(55, 77, 88, 99)] <- NA
ess$edulvlb[ess$edulvlb %in% c(5555, 7777, 8888, 9999)] <- NA

# now create dummy for bachelors degree
# for more details on the categories: https://www.europeansocialsurvey.org/docs/round8/survey/ESS8_data_protocol_e01_4.pdf
ess$educ.ba <- ifelse(ess$essround < 5 & ess$edulvla == 5, 1,
                      ifelse(ess$essround >= 5 & ess$edulvlb > 600, 1, 0))
tabl(ess$educ.ba)


# OCCUPATION
head(xtabs(~ iscoco + essround, data = ess))
head(xtabs(~ isco08 + essround, data = ess))

# load Oesch occupation-class crosswalks from the Github repo
# Alternatively, you can run the script "oesch_class_crosswalks.R" to produce them yourself

# crosswalk for ISCO 1988 codes
cw88 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_88_4dig_cleaned.csv")
cw88 <- cw88[, -1] # drop the first column of the downloaded file
names(cw88) <- c("isco88", "isco88_desc", "oesch_class88")
# crosswalk for ISCO 2008 codes
cw08 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_08_4dig_cleaned.csv")
cw08 <- cw08[, -1] # drop the first column of the downloaded file
names(cw08) <- c("isco08", "isco_desc08", "oesch_class08")

# dplyr joins treat NA keys as equal by default, so a crosswalk row with a
# missing ISCO code would be matched to every respondent whose occupation is
# missing. na_matches = "never" rules that out for both crosswalks.
ess <- left_join(ess, cw88, by = c("iscoco" = "isco88"), na_matches = "never")
ess <- left_join(ess, cw08, by = c("isco08" = "isco08"), na_matches = "never")
# prefer the ISCO-88 based class, fall back to the ISCO-08 based one
ess <- ess |>
  mutate(oesch_class = coalesce(oesch_class88, oesch_class08))
tabl(ess$oesch_class)

# NOTE: since I am constructing the class mapping just based on occupation
# categories 1-4 (the self-employed) of the Oesch class schema will not be included here
# if you want to include these categories, follow Oesch's mapping using the additional variables
# "emplrel" and "emplno"
ess <- ess |>
  mutate(oesch_class_sum = case_when(
    oesch_class %in% c(1, 2) ~ "Self-employed professionals",
    oesch_class %in% c(3, 4) ~ "Small business owners",
    oesch_class %in% c(5, 6) ~ "Technical (semi-)professionals",
    oesch_class %in% c(7, 8) ~ "Production workers",
    oesch_class %in% c(9, 10) ~ "(Associate) managers",
    oesch_class %in% c(11, 12) ~ "Clerks",
    oesch_class %in% c(13, 14) ~ "Sociocultural (semi-)professionals",
    oesch_class %in% c(15, 16) ~ "Service workers"))
tabl(ess$oesch_class_sum)

# gender: 1 -> 0, 2 -> 1, anything else -> NA
tabl(ess$gndr)
ess$female <- ifelse(ess$gndr == 2, 1,
                     ifelse(ess$gndr == 1, 0, NA))
tabl(ess$female)

# age: 999 is treated as missing, then binned into age groups
table(ess$agea)
ess$age <- replace(ess$agea, ess$agea == 999, NA)
table(ess$age)
ess$age.group <- cut(ess$age, breaks = c(0, 20, 35, 50, 65, 75, 120))
table(ess$age.group)

# year: rounds 1..11 map to 2002, 2004, ..., 2022; any other round stays NA
ess$essround.year <- seq(2002, 2022, by = 2)[match(ess$essround, 1:11)]

# party.vote.ess: key of the form "<country>-<round>-<vote code>",
# left missing when the respondent has no vote code
table(ess$cntry)
ess$party.vote.ess <- ifelse(is.na(ess$party.vote.num), NA,
                             paste0(ess$cntry, "-", ess$essround, "-", ess$party.vote.num))
tabl(ess$party.vote.ess)

# load the ESS-Partyfacts extended crosswalk and build the same key on it
cw_ess_pf <- fread("https://raw.githubusercontent.com/sophieehill/ess-partyfacts-crosswalk/master/ess-partyfacts-extended.csv")
cw_ess_pf$party.vote.ess <- paste0(cw_ess_pf$cntry, "-", cw_ess_pf$essround, "-", cw_ess_pf$ess_id)
cw_ess_pf <- cw_ess_pf |>
  select(party.vote.ess, partyfacts_id, partyfacts_name)

# merge partyfacts IDs into main dataset
ess <- left_join(ess, cw_ess_pf, by = "party.vote.ess")
tabl(ess$party.vote.ess)
tabl(ess$partyfacts_id) # many NAs

# now load the Partyfacts-External crosswalk and select the Manifesto dataset
# this lets us link those partyfacts IDs to *other* datasets
cw_pf <- fread("https://partyfacts.herokuapp.com/download/external-parties-csv", fill = TRUE)
cw_pf_cmp <- cw_pf |>
  filter(dataset_key == "manifesto") |>
  select(partyfacts_id, dataset_party_id)
cw_pf_cmp$dataset_party_id <- as.numeric(cw_pf_cmp$dataset_party_id)

names(cw_pf_cmp) <- c("partyfacts_id", "cmp_id")

# Many respondents have no partyfacts_id. dplyr joins match NA to NA by
# default, so any crosswalk row with a missing partyfacts_id would be attached
# to all of them (and multiply their rows). na_matches = "never" prevents it.
ess <- left_join(ess, cw_pf_cmp, by = "partyfacts_id", na_matches = "never")
tabl(ess$cmp_id)

# In order to merge in election-level variables (e.g. measures of a party's manifesto for a particular election), we need to match up the ESS dates to the most recent election
# Some ESS fieldwork occurs over an election period, meaning that respondents within the same country-round would be referring to different elections when they recall their "past vote"
# First, let's import the dataset from Denis Cohen's github: https://github.com/denis-cohen/ess-election-dates
ess_dates <- fread("https://raw.githubusercontent.com/denis-cohen/ess-election-dates/master/ess_election_dates.csv")
# select needed vars
ess_dates <- ess_dates |>
  select(cntry, essround, recent_election, recent_election_split1)
# merge in
ess <- left_join(ess, ess_dates, by = c("cntry", "essround"))

# create a variable indicating date of interview for each respondent
# first create day/month/year variables consistent across rounds
# from ESS Round 3 onwards, they give us the start (inwdds) AND end date (inwdde) of the interview
# here I am taking the start date as our reference point
# I *think* the politics module occurs fairly early during the survey
# Alternatively we could take the midpoint, or use the end date?
ess <- ess |>
  mutate(int.day = case_when(essround < 3 ~ inwdd,
                             essround > 2 ~ inwdds),
         int.month = case_when(essround < 3 ~ inwmm,
                               essround > 2 ~ inwmms),
         int.year = case_when(essround < 3 ~ inwyr,
                              essround > 2 ~ inwyys))
# An explicit format makes unparsable dates (e.g. "NA-NA-NA" when a component
# is missing) come out as NA. Without it, as.Date() guesses the format from the
# first string and stops with an error if that string is not a valid date.
ess <- ess |>
  mutate(int.date = as.Date(paste(int.year, int.month, int.day, sep = "-"),
                            format = "%Y-%m-%d"))
tabl(ess$int.date)
# for each respondent, let's define their "most recent election", based on start interview date
# (respondents with a missing interview date or election date get NA)
ess <- ess |>
  mutate(ref.election = case_when(int.date > recent_election ~ recent_election,
                                  int.date <= recent_election ~ recent_election_split1))
tabl(ess$ref.election)
# TODO: if the specific date is missing, match up using the country-year pair
# instead (not implemented yet)


# Merge with CMP data to get party families
# Download latest CMP dataset
# (Use API or just load "cmp.csv")
library(manifestoR)
# set API key
# The key is read from the MANIFESTO_API_KEY environment variable instead of
# being written into the script, so the credential is not shared with the code.
# NOTE(review): a key was previously committed here in plain text; it should
# be treated as exposed and replaced.
if (!nzchar(Sys.getenv("MANIFESTO_API_KEY"))) {
  stop("Set the MANIFESTO_API_KEY environment variable to your Manifesto Project API key.",
       call. = FALSE)
}
mp_setapikey(key = Sys.getenv("MANIFESTO_API_KEY"))
# download latest dataset
cmp <- as.data.frame(mp_maindataset())
# save for replicability
# write.csv(cmp, "cmp_main_2020.csv")
head(cmp)
tabl(cmp$edate)
summary(cmp$party)
# election year = first four characters of the CMP "date" field
cmp$election.year <- as.numeric(substr(cmp$date, 1, 4))

# economic left-right scale, following Bakker & Hobolt (2013):
# econlr is the logit scale of the "pos" vs. "neg" categories,
# econlr.sal is the summed share of all categories entering the scale
cmp <- cmp |>
  mutate(
    econlr = scale_logit(
      data = cmp,
      pos = c("per401", "per402", "per407", "per505",
              "per507", "per410", "per414", "per702"),
      neg = c("per403", "per404", "per406", "per504",
              "per506", "per413", "per412", "per701",
              "per405", "per409", "per415", "per503"),
      zero_offset = 0.5
    ),
    econlr.sal = (per401 + per402 + per407 + per505 + per507 + per410 + per414 + per702) +
      (per403 + per404 + per406 + per504 + per506 + per413 + per412 + per701 + per405 + per409 + per415 + per503)
  )

summary(cmp$econlr.sal)

# libertarian-authoritarian scale, built the same way
cmp <- cmp |>
  mutate(
    auth = scale_logit(
      data = cmp,
      pos = c("per305", "per601", "per603", "per605",
              "per608", "per606"),
      neg = c("per501", "per602", "per604", "per502",
              "per607", "per416", "per705", "per706",
              "per201", "per202"),
      zero_offset = 0.5
    ),
    auth.sal = (per305 + per601 + per603 + per605 + per608 + per606) +
      (per501 + per602 + per604 + per502 + per607 + per416 + per705 + per706 + per201 + per202)
  )

# keep party code and party family (relabelled for clarity), plus the
# party-election specific variables such as the right/left manifesto coding
cmp.x <- cmp |>
  select(cmp_id = party, cmp_parfam = parfam,
         election.year, edate, rile, econlr, econlr.sal, auth, auth.sal)
head(cmp.x)
# year of the reference election = first four characters of ref.election
ess$election.year <- as.numeric(substr(ess$ref.election, 1, 4))
tabl(ess$election.year)
# match up by election year
# N.B. this won't work for cases where two elections happen in the same year, and ESS fieldwork window covers the 2nd election
ess <- ess |>
  left_join(cmp.x, by = c("cmp_id", "election.year"))
# alternatively we could match on exact election date
# cmp.x$election.date <- as.Date(cmp.x$edate)
# ess$election.date <- as.Date(ess$ref.election)
# ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.date"))

# create left vote recall based on party families
# 10 = ecological
# 20 = socialist or other left
# 30 = social democratic
# (respondents without a party family stay NA)
ess$vote.left <- with(
  ess,
  ifelse(cmp_parfam == 10 | cmp_parfam == 20 | cmp_parfam == 30, 1, 0)
)
tabl(ess$vote.left)

names(ess)

head(ess)
# Keep only the core variables, in this order, and write them to disk.
# (domicil, nace.summary and lrscale are candidates that are not included.)
essx <- ess |>
  select(all_of(c(
    "idno", "cntry", "essround", "essround.year", "int.date",
    "female", "age", "age.group", "educ.ba",
    "oesch_class", "oesch_class_sum",
    "party.vote.ess", "partyfacts_id", "partyfacts_name",
    "cmp_id", "cmp_parfam", "vote.left", "ref.election",
    "election.year", "edate", "rile",
    "econlr", "econlr.sal", "auth", "auth.sal"
  ))) |>
  as.data.frame()

write.csv(essx, "ess_cumulative_core.csv")