# Source: GitLab snippet "code_11rounds.R" (14.9 KB)
# Last committed by Alain Guillet
    # load packages
    library(dplyr) # for data wrangling
    # library(essurvey) # to download ESS data
    # if necessary, install with this command:
    # devtools::install_github("ropensci/essurvey")
    library(sjlabelled) # to convert party vote choice into names
    library(data.table) # for the "fread" function to quickly load large csv files
    
    
    # Read the combined extract of the 11 ESS rounds, saved beforehand from the ESS data website
    ess_raw <- fread(
      input = "ESS1e06_7-ESS2e03_6-ESS3e03_7-ESS4e04_6-ESS5e03_5-ESS6e02_6-ESS7e02_3-ESS8e02_3-ESS9e03_2-ESS10-ESS10SC-ESS11-subset.csv",
      header = TRUE,
      sep = ","
    )
    
    # Now we need to create a function to:
    # (i) select required variables from the combined dataset of 11 rounds
    # (ii) create a generalized party vote choice variable, instead of having lots of country-round specific variables
    
    # note: for Germany there are TWO vote intention variables
    # since they cast 1 vote for a candidate "prtvde1" and then 1 vote for a party list "prtvde2"
    # The function below drops the variables ending in "de1" (the candidate vote),
    # so the party-list vote (variables ending in "de2") is the one that is kept
    
    # You can add the variables you want to extract in the select function below
    # Make sure to get the variable name exactly right: http://nesstar.ess.nsd.uib.no/webview/
    # Use "starts_with()" / "ends_with()" to grab all variables starting / ending with that string
    # es.df.clean <- function(x){
    #   esx <- x %>% select("essround", # REQUIRED: essround
    #                       "idno", # REQUIRED: respondent ID
    #                       "cntry", # REQUIRED: country 
    #                       starts_with("inw"), # REQUIRED: interview date (to match vote recall to specific election)
    #                       "gndr" , # gender
    #                       "agea", # age
    #                       starts_with("edulvl"), # educational attainment (several vars)
    #                       starts_with("isco"), # occupation
    #                       starts_with("prtv"), # party vote
    #                       -ends_with("de1"), # drop 1st German vote intention var
    #   ) %>% 
    #     as.data.frame()
    #   # find FIRST country-specific vote variable
    #   start <- head(grep("prtv", colnames(esx)), n=1)
    #   # find LAST country-specific vote variable
    #   end <- tail(grep("prtv", colnames(esx)), n=1)
    #   # mini dataset of party choice vars
    #   es.vote <- esx %>% select(start:end)
    #   # create dataset-wide vote variable by merging the country-specific vars
    #   esx$party.vote.num <- as.vector(do.call(coalesce, es.vote))
    #   # convert numeric values into party names
    #   es.vote.named <- as_label(es.vote)
    #   # convert factors into characters to make sure they're stored properly
    #   es.vote.named[] <- lapply(es.vote.named, as.character)
    #   # create another dataset-wide vote variable, this time for the character variable
    #   esx$party.vote.name <- as.vector(do.call(coalesce, es.vote.named))
    #   # convert to UTF encoding to deal with special characters
    #   # delete unnecessary variables
    #   start <- head(grep("prtvt", colnames(esx)), n=1)
    #   end <- tail(grep("prtvt", colnames(esx)), n=1)
    #   esx <- esx %>% select(-(start:end))
    #   esx
    # }
    
    # Reduce the raw ESS extract to the variables used downstream and collapse the
    # country-specific vote-recall columns ("prtv*") into two dataset-wide columns.
    #
    # Args:
    #   x: the raw ESS data, as a data.table (e.g. the result of fread()) or any
    #      object that as.data.table() can convert. It is not modified.
    #
    # Returns:
    #   A data.table with essround, idno, cntry, gndr, agea, every column whose
    #   name starts with "inw", "edulvl" or "isco", plus
    #     party.vote.num  - first non-missing value across the vote columns
    #     party.vote.name - the same value, stored as character
    #   The individual "prtv*" columns are removed.
    es.df.clean <- function(x) {
      # data.table syntax is used throughout; convert only when needed so that
      # an input that already is a data.table is not copied a second time
      if (!is.data.table(x)) {
        x <- as.data.table(x)
      }

      # Required variables: fixed names first, then each prefix group in turn
      # (this keeps the column order fixed / inw* / edulvl* / isco* / prtv*)
      fixed_columns <- c("essround", "idno", "cntry", "gndr", "agea")
      prefix_patterns <- c("^inw", "^edulvl", "^isco", "^prtv")
      prefix_columns <- unlist(
        lapply(prefix_patterns, grep, x = names(x), value = TRUE)
      )
      keep_columns <- c(fixed_columns, prefix_columns)

      # Drop variables ending with "de1" (first German vote variable)
      keep_columns <- keep_columns[!grepl("de1$", keep_columns)]

      # Drop prtvtro and prtvtait, which only contain missing values.
      # setdiff() is a no-op when a column is absent, so other extracts work too.
      keep_columns <- setdiff(keep_columns, c("prtvtro", "prtvtait"))

      # x[, .SD, ...] returns a new table, so the := calls below never touch x
      esx <- x[, .SD, .SDcols = keep_columns]

      # Country-specific vote variables that remain
      vote_cols <- grep("^prtv", names(esx), value = TRUE)
      if (length(vote_cols) == 0L) {
        stop("no vote-recall columns (names starting with \"prtv\") found",
             call. = FALSE)
      }

      # Create dataset-wide vote variable by merging the country-specific vars
      esx[, party.vote.num := do.call(fcoalesce, .SD), .SDcols = vote_cols]

      # Character version of the vote variables. .SDcols restricts the conversion
      # to the vote columns; without it every column is converted and the
      # coalesce below picks up the first column of the table instead of the vote
      # es.vote.named <- as.data.table(lapply(esx[, ..vote_cols], as_label))
      es.vote.named <- esx[, lapply(.SD, as.character), .SDcols = vote_cols]

      # Create another dataset-wide vote variable, this time for the character variable
      esx[, party.vote.name := do.call(fcoalesce, es.vote.named)]

      # Drop the country-specific vote variables, now merged into the two above
      esx[, .SD, .SDcols = setdiff(names(esx), vote_cols)]
    }
    
    
    # Clean the dataset with 11 rounds
    ess_clean <- es.df.clean(ess_raw)

    # Save the cleaned data to be able to reuse it later without the preprocessing
    write.csv(ess_clean, file = "ess_clean.csv", row.names = FALSE)

    # Free the memory held by the two large tables.
    # Only these objects are removed: rm(list = ls()) would wipe the whole
    # workspace, including anything defined before this script was run.
    rm(ess_raw, ess_clean)
    gc()
    
    # Frequency table that also counts missing values:
    # forwards every argument to table() and adds useNA = "ifany"
    tabl <- function(...) {
      table(..., useNA = "ifany")
    }
    
    # open the cleaned data
    ess <- read.csv("ess_clean.csv")

    # EDUCATION:
    # Let's create a dummy variable indicating that the respondent
    # has attained a bachelor's degree or above
    # ESS rounds 1-4 use the "edulvla" variable
    xtabs(~ essround + edulvla, data = ess)
    # ESS rounds 5 onwards use a more detailed "edulvlb" variable
    xtabs(~ essround + edulvlb, data = ess)

    # First code "other" AND the non-response codes (refusal, don't know,
    # no answer) as missing. Left as numbers, 7777/8888/9999 would pass the
    # "edulvlb > 600" test below and be counted as a degree, and 77/88/99 would be
    # counted as "no degree" instead of missing.
    # NOTE(review): codes taken from the ESS codebook - confirm against the extract
    ess$edulvla[ess$edulvla %in% c(55, 77, 88, 99)] <- NA
    ess$edulvlb[ess$edulvlb %in% c(5555, 7777, 8888, 9999)] <- NA

    # now create dummy for bachelors degree
    # for more details on the categories: https://www.europeansocialsurvey.org/docs/round8/survey/ESS8_data_protocol_e01_4.pdf
    # (the result is NA when the education variable of the respondent's round is missing)
    ess$educ.ba <- ifelse(ess$essround < 5 & ess$edulvla == 5, 1,
                          ifelse(ess$essround >= 5 & ess$edulvlb > 600, 1, 0))
    tabl(ess$educ.ba)
    
    
    # OCCUPATION
    head(xtabs(~ iscoco + essround, data = ess))
    head(xtabs(~ isco08 + essround, data = ess))

    # load Oesch occupation-class crosswalks from the Github repo
    # Alternatively, you can run the script "oesch_class_crosswalks.R" to produce them yourself

    # crosswalk for ISCO 1988 codes (the first column of the file is dropped)
    cw88 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_88_4dig_cleaned.csv")
    cw88 <- cw88[, -1]
    names(cw88) <- c("isco88", "isco88_desc", "oesch_class88")
    # crosswalk for ISCO 2008 codes (same layout)
    cw08 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_08_4dig_cleaned.csv")
    cw08 <- cw08[, -1]
    names(cw08) <- c("isco08", "isco_desc08", "oesch_class08")

    # Crosswalk rows without an ISCO code are removed from BOTH tables before
    # joining: left_join() matches NA keys to each other by default, so such a row
    # would attach to every respondent whose occupation code is missing.
    ess <- left_join(ess, cw88[!is.na(cw88$isco88), ], by = c("iscoco" = "isco88"))
    ess <- left_join(ess, cw08[!is.na(cw08$isco08), ], by = c("isco08" = "isco08"))
    # one class variable: the ISCO-88 based class where available, else the ISCO-08 one
    ess <- ess |>
      mutate(oesch_class = coalesce(oesch_class88, oesch_class08))
    tabl(ess$oesch_class)
    
    # NOTE: the class mapping here is built from occupation alone, so
    # categories 1-4 (the self-employed) of the Oesch class schema will not be included here
    # if you want to include these categories, follow Oesch's mapping using the additional variables
    # "emplrel" and "emplno"
    #
    # Collapse the 16 classes into 8 summary groups: classes (1,2) share the first
    # label, (3,4) the second, and so on. Any other value (including NA) gives NA.
    ess <- ess |>
      mutate(oesch_class_sum = rep(
        c("Self-employed professionals",
          "Small business owners",
          "Technical (semi-)professionals",
          "Production workers",
          "(Associate) managers",
          "Clerks",
          "Sociocultural (semi-)professionals",
          "Service workers"),
        each = 2
      )[match(oesch_class, 1:16)])
    tabl(ess$oesch_class_sum)
    
    # gender: 0/1 indicator built from gndr (1 -> 0, 2 -> 1, anything else -> NA)
    tabl(ess$gndr)
    ess$female <- c(0, 1)[match(ess$gndr, c(1, 2))]
    tabl(ess$female)

    # age: copy of agea with the value 999 turned into NA, then binned into groups
    table(ess$agea)
    ess$age <- replace(ess$agea, which(ess$agea == 999), NA)
    table(ess$age)
    ess$age.group <- cut(ess$age, breaks = c(0, 20, 35, 50, 65, 75, 120))
    table(ess$age.group)
    
    # year: map round 1 -> 2002, round 2 -> 2004, ..., round 11 -> 2022
    # (a round outside 1-11, or a missing round, gives NA)
    ess$essround.year <- seq(2002, 2022, by = 2)[match(ess$essround, 1:11)]
    
    # party.vote.ess: key of the form "<cntry>-<essround>-<party.vote.num>",
    # left missing when there is no vote code
    table(ess$cntry)
    ess$party.vote.ess <- paste0(ess$cntry, "-", ess$essround, "-", ess$party.vote.num)
    ess$party.vote.ess[is.na(ess$party.vote.num)] <- NA
    tabl(ess$party.vote.ess)

    # load the ESS-Partyfacts extended crosswalk, build the same key from its
    # cntry / essround / ess_id columns and keep only the key and the Partyfacts fields
    cw_ess_pf <- fread("https://raw.githubusercontent.com/sophieehill/ess-partyfacts-crosswalk/master/ess-partyfacts-extended.csv") |>
      mutate(party.vote.ess = paste0(cntry, "-", essround, "-", ess_id)) |>
      select(party.vote.ess, partyfacts_id, partyfacts_name)

    # merge partyfacts IDs into main dataset
    ess <- left_join(ess, cw_ess_pf, by = "party.vote.ess")
    tabl(ess$party.vote.ess)
    tabl(ess$partyfacts_id) # bcp NA
    
    # now load the Partyfacts-External crosswalk and select the Manifesto dataset
    # this lets us link those partyfacts IDs to *other* datasets
    cw_pf <- fread("https://partyfacts.herokuapp.com/download/external-parties-csv", fill = TRUE)

    # keep the Manifesto rows only; the external party id becomes a numeric "cmp_id"
    cw_pf_cmp <- cw_pf |>
      filter(dataset_key == "manifesto") |>
      select(partyfacts_id, cmp_id = dataset_party_id) |>
      mutate(cmp_id = as.numeric(cmp_id))

    ess <- left_join(ess, cw_pf_cmp, by = "partyfacts_id")
    tabl(ess$cmp_id)
    
    # In order to merge in election-level variables (e.g. measures of a party's manifesto for a particular election), we need to match up the ESS dates to the most recent election
    # Some ESS fieldwork occurs over an election period, meaning that respondents within the same country-round would be referring to different elections when they recall their "past vote"
    # First, let's import the dataset from Denis Cohen's github: https://github.com/denis-cohen/ess-election-dates
    ess_dates <- fread("https://raw.githubusercontent.com/denis-cohen/ess-election-dates/master/ess_election_dates.csv")
    # select needed vars
    ess_dates <- ess_dates |>
      select(cntry, essround, recent_election, recent_election_split1)
    # merge in
    ess <- left_join(ess, ess_dates, by = c("cntry", "essround"))

    # create a variable indicating date of interview for each respondent
    # first create day/month/year variables consistent across rounds:
    # rounds 1-2 use inwdd / inwmm / inwyr, later rounds use inwdds / inwmms / inwyys
    # from ESS Round 3 onwards, they give us the start (inwdds) AND end date (inwdde) of the interview
    # here I am taking the start date as our reference point
    # I *think* the politics module occurs fairly early during the survey
    # Alternatively we could take the midpoint, or use the end date?
    # NOTE(review): confirm that inwdds / inwmms / inwyys are filled in for the most
    # recent rounds of the extract; where they are not, int.date and ref.election are NA
    ess <- ess |>
      mutate(int.day = case_when(essround < 3 ~ inwdd,
                                 essround > 2 ~ inwdds),
             int.month = case_when(essround < 3 ~ inwmm,
                                   essround > 2 ~ inwmms),
             int.year = case_when(essround < 3 ~ inwyr,
                                  essround > 2 ~ inwyys))
    # The explicit format turns impossible dates (missing parts, non-response codes)
    # into NA. Without it, as.Date() guesses the format from the first non-NA string
    # and stops with an error when that string is not a valid date.
    ess <- ess |>
      mutate(int.date = as.Date(paste(int.year, int.month, int.day, sep = "-"),
                                format = "%Y-%m-%d"))
    tabl(ess$int.date)
    # for each respondent, let's define their "most recent election", based on start interview date:
    # interviewed after recent_election -> recent_election, otherwise -> recent_election_split1
    ess <- ess |>
      mutate(ref.election = case_when(int.date > recent_election ~ recent_election,
                                      int.date <= recent_election ~ recent_election_split1))
    tabl(ess$ref.election)
    # if the specific date is missing let's just match up using the country-year pair
    # (NOTE: this fallback is not implemented here; ref.election stays NA when int.date is NA)
    
    
    # Merge with CMP data to get party families
    # Download latest CMP dataset
    # (Use API or just load "cmp.csv")
    library(manifestoR)
    # set API key
    # NOTE(review): a personal API key is committed in this script. Supply your own
    # through the MANIFESTO_API_KEY environment variable (e.g. in ~/.Renviron).
    # The literal below is only a fallback so that existing runs keep working;
    # it should be rotated and removed from the source.
    cmp_api_key <- Sys.getenv("MANIFESTO_API_KEY")
    if (!nzchar(cmp_api_key)) {
      cmp_api_key <- "70af9d9d7f76a3d66d41142debe969f6"
    }
    mp_setapikey(key = cmp_api_key)
    # download latest dataset
    cmp <- as.data.frame(mp_maindataset())
    # save for replicability
    # write.csv(cmp, "cmp_main_2020.csv")
    head(cmp)
    tabl(cmp$edate)
    summary(cmp$party)
    # create election year variable (first four characters of "date")
    cmp$election.year <- as.numeric(substr(cmp$date, 1, 4))
    # create econ l-r and lib-auth scales, following Bakker & Hobolt (2013)
    # For each dimension:
    #   <dim>     - logit scale built by scale_logit() from the "pos" and "neg" items
    #   <dim>.sal - salience: the sum of all items of that dimension (pos + neg)
    cmp <- cmp |>
      mutate(
        econlr = scale_logit(
          data = cmp,
          pos = c("per401", "per402", "per407", "per505",
                  "per507", "per410", "per414", "per702"),
          neg = c("per403", "per404", "per406", "per504",
                  "per506", "per413", "per412", "per701",
                  "per405", "per409", "per415", "per503"),
          zero_offset = 0.5
        ),
        econlr.sal =
          (per401 + per402 + per407 + per505 + per507 + per410 + per414 + per702) +
          (per403 + per404 + per406 + per504 + per506 + per413 + per412 + per701 + per405 + per409 + per415 + per503),
        auth = scale_logit(
          data = cmp,
          pos = c("per305", "per601", "per603", "per605",
                  "per608", "per606"),
          neg = c("per501", "per602", "per604", "per502",
                  "per607", "per416", "per705", "per706",
                  "per201", "per202"),
          zero_offset = 0.5
        ),
        auth.sal =
          (per305 + per601 + per603 + per605 + per608 + per606) +
          (per501 + per602 + per604 + per502 + per607 + per416 + per705 + per706 + per201 + per202)
      )

    summary(cmp$econlr.sal)
    # keep party code and party family (renamed cmp_id / cmp_parfam for clarity)
    # plus the party-election specific variables, e.g. right/left coding of the manifesto
    cmp.x <- cmp |>
      select(cmp_id = party, cmp_parfam = parfam,
             election.year, edate, rile,
             econlr, econlr.sal, auth, auth.sal)
    head(cmp.x)

    # election year of each respondent's reference election (first four characters)
    ess$election.year <- as.numeric(substr(ess$ref.election, 1, 4))
    tabl(ess$election.year)

    # match up by party and election year
    # N.B. this won't work for cases where two elections happen in the same year, and ESS fieldwork window covers the 2nd election
    ess <- left_join(ess, cmp.x, by = c("cmp_id", "election.year"))
    # alternatively we could match on exact election date
    # cmp.x$election.date <- as.Date(cmp.x$edate)
    # ess$election.date <- as.Date(ess$ref.election)
    # ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.date"))
    
    # left vote recall, based on party families:
    #   10 = ecological
    #   20 = socialist or other left
    #   30 = social democratic
    # 1 for these three families, 0 for any other family, NA when the family is missing
    ess$vote.left <- as.numeric(
      ess$cmp_parfam == 10 | ess$cmp_parfam == 20 | ess$cmp_parfam == 30
    )
    tabl(ess$vote.left)

    names(ess)

    head(ess)

    # core variables of the cumulative file
    # (not included: domicil, nace.summary, lrscale)
    essx <- as.data.frame(
      select(
        ess,
        idno, cntry, essround, essround.year, int.date,
        female, age, age.group, educ.ba,
        oesch_class, oesch_class_sum,
        party.vote.ess, partyfacts_id, partyfacts_name,
        cmp_id, cmp_parfam, vote.left, ref.election,
        election.year, edate, rile,
        econlr, econlr.sal, auth, auth.sal
      )
    )

    write.csv(essx, "ess_cumulative_core.csv")