Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
# load packages
library(dplyr) # for data wrangling
# library(essurvey) # to download ESS data
# if necessary, install with this command:
# devtools::install_github("ropensci/essurvey")
library(sjlabelled) # to convert party vote choice into names
library(data.table) # for the "fread" function to quickly load large csv files
# Read the combined ESS extract (all rounds in one file, saved beforehand
# from the ESS data website). The file is comma-separated with a header row.
ess_raw <- fread(
  "ESS1e06_7-ESS2e03_6-ESS3e03_7-ESS4e04_6-ESS5e03_5-ESS6e02_6-ESS7e02_3-ESS8e02_3-ESS9e03_2-ESS10-ESS10SC-ESS11-subset.csv",
  header = TRUE,
  sep = ","
)
# Now we need to create a function to:
# (i) select the required variables from the combined dataset (all ESS rounds loaded above)
# (ii) create a generalized party vote choice variable, instead of having lots of country-round specific variables
# note: for Germany there are TWO vote intention variables
# since they cast 1 vote for a candidate "prtvde1" and then 1 vote for a party list "prtvde2"
# NOTE(review): the function below drops the variables ending in "de1" (the candidate vote),
# so the German vote that is actually kept is the party-list vote "prtvde2" - confirm this is intended
# You can add the variables you want to extract in the select function below
# Make sure to get the variable name exactly right: http://nesstar.ess.nsd.uib.no/webview/
# Use "starts_with()" / "ends_with()" to grab all variables starting / ending with that string
# es.df.clean <- function(x){
# esx <- x %>% select("essround", # REQUIRED: essround
# "idno", # REQUIRED: respondent ID
# "cntry", # REQUIRED: country
# starts_with("inw"), # REQUIRED: interview date (to match vote recall to specific election)
# "gndr" , # gender
# "agea", # age
# starts_with("edulvl"), # educational attainment (several vars)
# starts_with("isco"), # occupation
# starts_with("prtv"), # party vote
# -ends_with("de1"), # drop 1st German vote intention var
# ) %>%
# as.data.frame()
# # find FIRST country-specific vote variable
# start <- head(grep("prtv", colnames(esx)), n=1)
# # find LAST country-specific vote variable
# end <- tail(grep("prtv", colnames(esx)), n=1)
# # mini dataset of party choice vars
# es.vote <- esx %>% select(start:end)
# # create dataset-wide vote variable by merging the country-specific vars
# esx$party.vote.num <- as.vector(do.call(coalesce, es.vote))
# # convert numeric values into party names
# es.vote.named <- as_label(es.vote)
# # convert factors into characters to make sure they're stored properly
# es.vote.named[] <- lapply(es.vote.named, as.character)
# # create another dataset-wide vote variable, this time for the character variable
# esx$party.vote.name <- as.vector(do.call(coalesce, es.vote.named))
# # convert to UTF encoding to deal with special characters
# # delete unnecessary variables
# start <- head(grep("prtvt", colnames(esx)), n=1)
# end <- tail(grep("prtvt", colnames(esx)), n=1)
# esx <- esx %>% select(-(start:end))
# esx
# }
# Reduce the raw ESS extract to the core variables and add a single,
# dataset-wide vote-recall variable.
#
# Arguments:
#   x  the raw ESS data: a data.table (as returned by fread) or any object
#      that as.data.table() can convert.
#
# Returns a new data.table holding
#   - essround, idno, cntry, gndr, agea
#   - every column whose name starts with "inw", "edulvl" or "isco"
#   - party.vote.num:  first non-missing value across the country-specific
#                      "prtv*" columns (taken in column order)
#   - party.vote.name: the same value stored as a character string
# The country-specific "prtv*" columns themselves are dropped from the result.
es.df.clean <- function(x) {
  # The `:=` / .SD syntax used below only works on a data.table
  if (!is.data.table(x)) {
    x <- as.data.table(x)
  }

  # Select required variables
  fixed_columns <- c("essround", "idno", "cntry", "gndr", "agea")
  columns_starting_inw <- grep("^inw", names(x), value = TRUE)
  columns_starting_edulvl <- grep("^edulvl", names(x), value = TRUE)
  columns_starting_isco <- grep("^isco", names(x), value = TRUE)
  columns_starting_prtv <- grep("^prtv", names(x), value = TRUE)
  selected_columns <- c(
    fixed_columns,
    columns_starting_inw,
    columns_starting_edulvl,
    columns_starting_isco,
    columns_starting_prtv
  )
  # Build a new table with just the selected columns; the `:=` calls below
  # operate on this table
  esx <- x[, .SD, .SDcols = selected_columns]

  # Drop variables ending with "de1" (first German vote variable)
  columns_to_delete <- grep("de1$", names(esx), value = TRUE)
  if (length(columns_to_delete) > 0) {
    esx[, (columns_to_delete) := NULL]
  }

  # Drop prtvtro and prtvtait, which contain only missing values.
  # intersect() keeps this step silent when a column is absent from the extract.
  # esx[,.N, by=prtvtro]
  # esx[,.N, by=prtvtait]
  empty_vote_columns <- intersect(c("prtvtro", "prtvtait"), names(esx))
  if (length(empty_vote_columns) > 0) {
    esx[, (empty_vote_columns) := NULL]
  }

  # All remaining country-specific vote variables
  vote_cols <- grep("^prtv", names(esx), value = TRUE)

  # Dataset-wide numeric vote variable: first non-missing country-specific value
  esx[, party.vote.num := do.call(fcoalesce, .SD), .SDcols = vote_cols]

  # Character version of the vote columns (the as_label() conversion to party
  # names is currently disabled):
  # es.vote.named <- as.data.table(lapply(esx[, ..vote_cols], as_label))
  # Restrict the conversion to the vote columns. Without .SDcols every column
  # of esx (essround, idno, ...) would be coalesced and party.vote.name would
  # simply be the first non-missing column, i.e. the ESS round.
  es.vote.named <- esx[, lapply(.SD, as.character), .SDcols = vote_cols]
  esx[, party.vote.name := do.call(fcoalesce, es.vote.named)]

  # Drop the country-specific vote variables, keeping everything else
  esx[, .SD, .SDcols = setdiff(names(esx), vote_cols)]
}
# Clean the dataset with 11 rounds
ess_clean <- es.df.clean(ess_raw)
# Save the cleaned data so it can be reused later without the preprocessing
write.csv(ess_clean, file = "ess_clean.csv", row.names = FALSE)
# Free memory: remove only the two large tables created above rather than
# wiping the whole workspace with rm(list = ls()), which would also delete
# any unrelated objects in the user's session.
rm(ess_raw, ess_clean)
gc()
# Convenience wrapper around table(): tabulates its arguments and adds an
# NA category whenever the data contain missing values.
tabl <- function(...) {
  table(..., useNA = "ifany")
}
# open the cleaned data
ess <- read.csv("ess_clean.csv")

# EDUCATION:
# Create a dummy variable (educ.ba) indicating that the respondent
# has attained a bachelor's degree or above
# ESS rounds 1-4 use the "edulvla" variable
xtabs(~ essround + edulvla, data = ess)
# ESS rounds 5 onwards use a more detailed "edulvlb" variable
xtabs(~ essround + edulvlb, data = ess)

# Code all non-substantive answers as missing, not only "other".
# The remaining codes are presumably refusal / don't know / no answer per the
# ESS codebook - verify against the data protocol linked below. They are plain
# numbers in the CSV, so without this step 7777/8888/9999 would satisfy
# `edulvlb > 600` and be counted as degree holders.
ess$edulvla[ess$edulvla %in% c(55, 77, 88, 99)] <- NA
ess$edulvlb[ess$edulvlb %in% c(5555, 7777, 8888, 9999)] <- NA

# now create dummy for bachelors degree
# for more details on the categories: https://www.europeansocialsurvey.org/docs/round8/survey/ESS8_data_protocol_e01_4.pdf
# Rounds 1-4: edulvla == 5; rounds 5+: edulvlb > 600.
# A missing education value in the relevant round yields NA.
ess$educ.ba <- ifelse(
  ess$essround < 5 & ess$edulvla == 5, 1,
  ifelse(ess$essround >= 5 & ess$edulvlb > 600, 1, 0)
)
tabl(ess$educ.ba)
# OCCUPATION
# Map ISCO occupation codes to Oesch classes and a summary class label.
head(xtabs(~ iscoco + essround, data = ess))
head(xtabs(~ isco08 + essround, data = ess))
# load Oesch occupation-class crosswalks from the Github repo
# Alternatively, you can run the script "oesch_class_crosswalks.R" to produce them yourself
# crosswalk for ISCO 1988 codes (first column dropped, remaining three renamed)
cw88 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_88_4dig_cleaned.csv")
cw88 <- cw88[, -1]
names(cw88) <- c("isco88", "isco88_desc", "oesch_class88")
# crosswalk for ISCO 2008 codes (same treatment)
cw08 <- fread("https://raw.githubusercontent.com/sophieehill/ess-cumulative/master/crosswalks/oesch_08_4dig_cleaned.csv")
cw08 <- cw08[, -1]
names(cw08) <- c("isco08", "isco_desc08", "oesch_class08")
# Drop crosswalk rows without an ISCO code before joining: dplyr joins match
# NA to NA by default, so such a row would attach itself to every respondent
# whose occupation code is missing. Both crosswalks are filtered the same way.
ess <- left_join(ess, cw88[!is.na(cw88$isco88), ], by = c("iscoco" = "isco88"))
ess <- left_join(ess, cw08[!is.na(cw08$isco08), ], by = c("isco08" = "isco08"))
# Prefer the ISCO-88 based class, fall back to the ISCO-08 based one
ess <- ess |>
  mutate(oesch_class = coalesce(oesch_class88, oesch_class08))
tabl(ess$oesch_class)
# NOTE: since I am constructing the class mapping just based on occupation
# categories 1-4 (the self-employed) of the Oesch class schema will not be included here
# if you want to include these categories, follow Oesch's mapping using the additional variables
# "emplrel" and "emplno"
# Collapse the 16 classes pairwise into 8 labelled groups; anything else -> NA
ess <- ess |>
  mutate(oesch_class_sum = case_when(
    oesch_class %in% c(1, 2) ~ "Self-employed professionals",
    oesch_class %in% c(3, 4) ~ "Small business owners",
    oesch_class %in% c(5, 6) ~ "Technical (semi-)professionals",
    oesch_class %in% c(7, 8) ~ "Production workers",
    oesch_class %in% c(9, 10) ~ "(Associate) managers",
    oesch_class %in% c(11, 12) ~ "Clerks",
    oesch_class %in% c(13, 14) ~ "Sociocultural (semi-)professionals",
    oesch_class %in% c(15, 16) ~ "Service workers"
  ))
tabl(ess$oesch_class_sum)
# gender: recode "gndr" into a 0/1 dummy (1 -> 0, 2 -> 1, anything else -> NA)
# presumably 1 = male and 2 = female in the ESS coding - verify against the codebook
tabl(ess$gndr)
ess$female <- c(0, 1)[match(ess$gndr, c(1, 2))]
tabl(ess$female)

# age: copy "agea", treating the code 999 as missing, then bin into groups
table(ess$agea)
ess$age <- replace(ess$agea, ess$agea == 999, NA)
table(ess$age)
# intervals are right-closed: (0,20], (20,35], (35,50], (50,65], (65,75], (75,120]
ess$age.group <- cut(
  ess$age,
  breaks = c(0, 20, 35, 50, 65, 75, 120)
)
table(ess$age.group)
# year: rounds 1..11 map to 2002, 2004, ..., 2022, i.e. 2000 + 2 * round;
# any other round value stays NA
ess$essround.year <- ifelse(
  ess$essround %in% 1:11,
  2000 + 2 * ess$essround,
  NA
)
# party.vote.ess: key of the form "<cntry>-<essround>-<party.vote.num>",
# left missing when the respondent has no recorded vote
table(ess$cntry)
ess$party.vote.ess <- replace(
  paste0(ess$cntry, "-", ess$essround, "-", ess$party.vote.num),
  is.na(ess$party.vote.num),
  NA
)
tabl(ess$party.vote.ess)
# Load the ESS-Partyfacts extended crosswalk, build the same
# "<cntry>-<essround>-<ess_id>" key as in the main data, and keep only the key
# plus the Partyfacts ID and name
cw_ess_pf <- fread("https://raw.githubusercontent.com/sophieehill/ess-partyfacts-crosswalk/master/ess-partyfacts-extended.csv") |>
  mutate(party.vote.ess = paste0(cntry, "-", essround, "-", ess_id)) |>
  select(party.vote.ess, partyfacts_id, partyfacts_name)

# merge partyfacts IDs into main dataset
ess <- left_join(ess, cw_ess_pf, by = "party.vote.ess")
tabl(ess$party.vote.ess)
tabl(ess$partyfacts_id) # many NAs

# Now load the Partyfacts-External crosswalk and keep the Manifesto rows;
# this lets us link those partyfacts IDs to *other* datasets.
# The Manifesto party ID is renamed to cmp_id and converted to numeric.
cw_pf <- fread(
  "https://partyfacts.herokuapp.com/download/external-parties-csv",
  fill = TRUE
)
cw_pf_cmp <- cw_pf |>
  filter(dataset_key == "manifesto") |>
  select(partyfacts_id, cmp_id = dataset_party_id) |>
  mutate(cmp_id = as.numeric(cmp_id))

ess <- left_join(ess, cw_pf_cmp, by = "partyfacts_id")
tabl(ess$cmp_id)
# In order to merge in election-level variables (e.g. measures of a party's manifesto for a particular election), we need to match up the ESS dates to the most recent election
# Some ESS fieldwork occurs over an election period, meaning that respondents within the same country-round would be referring to different elections when they recall their "past vote"
# First, let's import the dataset from Denis Cohen's github: https://github.com/denis-cohen/ess-election-dates
ess_dates <- fread("https://raw.githubusercontent.com/denis-cohen/ess-election-dates/master/ess_election_dates.csv")
# select needed vars
ess_dates <- ess_dates |>
  select(cntry, essround, recent_election, recent_election_split1)
# merge in
ess <- left_join(ess, ess_dates, by = c("cntry", "essround"))
# create a variable indicating date of interview for each respondent
# first create day/month/year variables consistent across rounds
# from ESS Round 3 onwards, they give us the start (inwdds) AND end date (inwdde) of the interview
# here I am taking the start date as our reference point
# I *think* the politics module occurs fairly early during the survey
# Alternatively we could take the midpoint, or use the end date?
ess <- ess |>
  mutate(
    int.day = case_when(
      essround < 3 ~ inwdd,
      essround > 2 ~ inwdds
    ),
    int.month = case_when(
      essround < 3 ~ inwmm,
      essround > 2 ~ inwmms
    ),
    int.year = case_when(
      essround < 3 ~ inwyr,
      essround > 2 ~ inwyys
    )
  )
# Parse with an explicit format: without one, as.Date() guesses the format from
# the first non-NA string and stops with an error if that string is not a
# valid date (e.g. "NA-NA-NA" for a respondent with no interview date).
# With a format, unparseable dates simply become NA.
ess <- ess |>
  mutate(int.date = as.Date(
    paste(int.year, int.month, int.day, sep = "-"),
    format = "%Y-%m-%d"
  ))
tabl(ess$int.date)
# for each respondent, let's define their "most recent election", based on start interview date:
# interviewed after recent_election -> recent_election, otherwise recent_election_split1
ess <- ess |>
  mutate(ref.election = case_when(
    int.date > recent_election ~ recent_election,
    int.date <= recent_election ~ recent_election_split1
  ))
tabl(ess$ref.election)
# if the specific date is missing let's just match up using the country-year pair
# Merge with CMP data to get party families
# Download latest CMP dataset
# (Use API or just load "cmp.csv")
library(manifestoR)
# set API key
# NOTE(review): the key is hard-coded in the script - consider reading it from
# an environment variable or a key file before sharing this file
mp_setapikey(key = "70af9d9d7f76a3d66d41142debe969f6")
# download latest dataset
cmp <- as.data.frame(mp_maindataset())
# save for replicability
# write.csv(cmp, "cmp_main_2020.csv")
head(cmp)
tabl(cmp$edate)
summary(cmp$party)

# Derived party-election variables, added in one step:
#   election.year - first four characters of "date"
#   econlr / auth - logit scales for economic left-right and
#                   libertarian-authoritarian, following Bakker & Hobolt (2013)
#   *.sal         - sum of all categories entering the respective scale
cmp <- cmp |>
  mutate(
    election.year = as.numeric(substr(date, 1, 4)),
    econlr = scale_logit(
      data = cmp,
      pos = c("per401", "per402", "per407", "per505",
              "per507", "per410", "per414", "per702"),
      neg = c("per403", "per404", "per406", "per504",
              "per506", "per413", "per412", "per701",
              "per405", "per409", "per415", "per503"),
      zero_offset = 0.5
    ),
    econlr.sal = (per401 + per402 + per407 + per505 + per507 + per410 + per414 + per702) +
      (per403 + per404 + per406 + per504 + per506 + per413 + per412 + per701 + per405 + per409 + per415 + per503),
    auth = scale_logit(
      data = cmp,
      pos = c("per305", "per601", "per603", "per605",
              "per608", "per606"),
      neg = c("per501", "per602", "per604", "per502",
              "per607", "per416", "per705", "per706",
              "per201", "per202"),
      zero_offset = 0.5
    ),
    auth.sal = (per305 + per601 + per603 + per605 + per608 + per606) +
      (per501 + per602 + per604 + per502 + per607 + per416 + per705 + per706 + per201 + per202)
  )
summary(cmp$econlr.sal)

# Keep party code and party family (renamed to cmp_id / cmp_parfam for clarity)
# plus the party-election specific variables such as the right/left coding
cmp.x <- cmp |>
  select(
    cmp_id = party,
    cmp_parfam = parfam,
    election.year, edate, rile,
    econlr, econlr.sal, auth, auth.sal
  )
head(cmp.x)
# Year of each respondent's reference election (first four characters)
ess$election.year <- as.numeric(substr(ess$ref.election, 1, 4))
tabl(ess$election.year)
# match up by election year
# N.B. this won't work for cases where two elections happen in the same year, and ESS fieldwork window covers the 2nd election
ess <- left_join(ess, cmp.x, by = c("cmp_id", "election.year"))
# alternatively we could match on exact election date
# cmp.x$election.date <- as.Date(cmp.x$edate)
# ess$election.date <- as.Date(ess$ref.election)
# ess <- left_join(ess, cmp.x, by=c("cmp_id", "election.date"))

# create left vote recall based on party families
# 10 = ecological
# 20 = socialist or other left
# 30 = social democratic
# 1 for these families, 0 for any other family, NA when the family is unknown
ess$vote.left <- ifelse(
  is.na(ess$cmp_parfam),
  NA,
  as.numeric(ess$cmp_parfam %in% c(10, 20, 30))
)
tabl(ess$vote.left)
names(ess)
head(ess)

# Core export: identifiers, demographics, class, vote recall and CMP measures
essx <- ess |>
  select(all_of(c(
    "idno", "cntry", "essround", "essround.year", "int.date",
    "female", "age", "age.group", "educ.ba",
    "oesch_class", "oesch_class_sum",
    # domicil, nace.summary, lrscale,
    "party.vote.ess", "partyfacts_id", "partyfacts_name",
    "cmp_id", "cmp_parfam", "vote.left", "ref.election",
    "election.year", "edate", "rile",
    "econlr", "econlr.sal", "auth", "auth.sal"
  ))) |>
  as.data.frame()
write.csv(essx, file = "ess_cumulative_core.csv")