/*****************************************************************************************
* MERGING INDIVIDUAL FILES ACROSS WAVES INTO LONG FORMAT                                 *
* To match individual level files across two waves into a long format                    *
* do the following (for more waves add wave specific prefix in the foreach statement)    *
*****************************************************************************************/

// Set the working directory; all relative file names below resolve against it
cd "UK life satisfaction"

// Global macro pointing at the folder that holds the Understanding Society Stata files.
// Forward slashes are used throughout: Stata accepts them on every platform, whereas
// backslashes only work on Windows and can escape a macro character that follows them.
global ukhls "Understanding Society/UKDA-6614-stata/stata/stata11_se"

// Write one prefix-free temporary file per main-survey wave.
// For fewer waves, list only the wave prefixes of the waves you need to merge.
local alphabet "abcdefghijklmnopqrstuvwxyz"
foreach w in a b c d e f g h i {

	// the position of the prefix letter in the alphabet is the wave number (a -> 1, b -> 2, ...)
	local waveno = strpos("`alphabet'", "`w'")

	// load the person identifier and every wave-prefixed variable of the individual response file
	use pidp `w'_* using "$ukhls/ukhls_w`waveno'/`w'_indresp", clear

	// strip the wave prefix so that variables carry the same name in every wave
	rename `w'_* *

	// remember which wave these rows came from
	generate wave = `waveno'

	// store the wave in its own temporary file in the working directory
	save temp`w', replace
}

// Start from the first wave (prefix a_) ...
use tempa, clear

// ... and stack every later wave underneath it
local later_waves b c d e f g h i
foreach w of local later_waves {
	append using temp`w'
}

// number of observations contributed by each wave
tabulate wave

// store the stacked (long-format) file
save longfile, replace

* up to here code from Understanding Society team

* Add the Covid-19 survey (April 2020) to the long file as wave 10
use "ca_indresp_w.dta" , clear
generate wave = 10
rename ca_* *                 // drop the ca_ prefix, as done for the main waves
generate intdaty_dv = 2020    // interview year is set to 2020 for every Covid-survey row
save temp2020.dta, replace

use longfile, clear
append using temp2020

// the per-wave temporary files are no longer needed
foreach tmp in tempa tempb tempc tempd tempe tempf tempg temph tempi temp2020 {
	erase `tmp'.dta
}

compress

// pid / syear naming follows the SOEP convention
rename (pidp intdaty_dv) (pid syear)

// declare the panel structure: person by wave
xtset pid wave

save longfile, replace
use longfile, clear

// Approximate survey year derived from the wave number (wave 10 is labelled as April 2020)
recode wave ///
	(1=2010 "≈2010") (2=2011 "≈2011") (3=2012 "≈2012") (4=2013 "≈2013") (5=2014 "≈2014") ///
	(6=2015 "≈2015") (7=2016 "≈2016") (8=2017 "≈2017") (9=2018 "≈2018") (10=2020 "4.2020"), ///
	gen(approx_syear)

* Life satisfaction on a 0-100 scale (more = more satisfied with life), built from scghq1_dv
capture drop life_sat*
generate life_sat = scghq1_dv if scghq1_dv >= 0   // values below 0 are left out

// observed minimum and maximum of life_sat, kept in temporary scalars
tempname ls_min ls_max
summarize life_sat
scalar `ls_min' = r(min)
scalar `ls_max' = r(max)

// stretch the observed range to 0-100
generate life_sat_rs = (life_sat - `ls_min') / (`ls_max' - `ls_min') * 100

// reverse the scale: the highest rescaled score becomes 0, the lowest becomes the maximum
summarize life_sat_rs
generate life_sat100 = r(max) - life_sat_rs
label variable life_sat100 "Well-being 0-100"


* Weights; for info see: Understanding Society – UK Household Longitudinal Study:
* Wave 1-9, 2009-2018. User Guide. November 2019, page 75

// xsect_weight takes the first non-negative weight found, trying the variables in this order.
// NOTE(review): the original inline comments appear shifted by one line; the reading below
// is the presumed intent and should be confirmed against the user guide.
//   indinui_xw   - adult main interview (BHPS, GPS, EMBS and IEMBS; also reference weight for the Covid-19 survey)
//   indinub_xw   - adult main interview (BHPS, GPS, EMBS but WITHOUT IEMBS)
//   indinus_xw   - adult main interview without additional samples (first wave only)
//   betaindin_xw - weight of the Covid-19 survey
generate xsect_weight = .
foreach wgt in indinui_xw indinub_xw indinus_xw betaindin_xw {
	replace xsect_weight = `wgt' if xsect_weight==. & `wgt'>=0
}

// Covid-survey weight copied to every row of the same person
bysort pid: egen weight_wave10_generalized = max(betaindin_xw)
label variable weight_wave10_generalized "Weight Covid survey generalized to all cases of person"

// weight from each person's last observed wave, copied to all of that person's rows
bysort pid: egen max_wave = max(wave)
tempvar w_last
generate `w_last' = xsect_weight if wave==max_wave
bysort pid: egen last_weight_pers_constant = max(`w_last')
label variable last_weight_pers_constant "last weight for each person (time constant for xtreg)"
drop `w_last'

// GHQ items scghqa ... scghql: blank out negative codes, then derive two recoded versions
foreach ltr in a b c d e f g h i j k l {
	local item scghq`ltr'

	// negative values are set to missing
	replace `item' = . if `item' < 0
	codebook `item'

	// label of the source item, reused for both derived variables
	local itemlab : variable label `item'

	// lin_*: reversed 1-4 coding
	recode `item' (1=4 "better") (2=3 "same") (3=2 "worse") (4=1 "much worse"), gen(lin_`item')
	label variable lin_`item' "`itemlab'"

	// cat_*: binary indicator, answers 3 and 4 versus answers 1 and 2
	recode `item' (1/2=0 "same") (3/4=1 "more") (else=.), gen(cat_`item')
	label variable cat_`item' "increased problem: `itemlab'"
}

// Age. Negative values are blanked out FIRST so that they can be filled from the birth
// year as well; previously only rows with age==. were filled and rows with a negative
// age were lost in the age<16 step without any attempt to compute their age.
// NOTE(review): assumes negative ages are missing-value codes, as for the other variables.
replace age=. if age<0
replace age=syear-birthy if syear>0 & birthy>0 & age==. // age for years where missing
replace age=. if age<16 // ages below 16 are not used

// Short, readable labels for the GHQ items and their binary "increased problem" versions
label var scghqk "ghq: believe worthless"
label var cat_scghqa "Problems concentrating"
label var cat_scghqb "Sleeplessness"
label var cat_scghqc "Not feeling useful"
label var cat_scghqd "Inability to decide"
label var cat_scghqe "Constantly under strain"
label var cat_scghqf "Problems overcoming difficulties"
label var cat_scghqg "Problems enjoying day-to-day activities"
label var cat_scghqh "Inability to face problems"
label var cat_scghqi "Unhappy or depressed"
label var cat_scghqj "Losing confidence"
label var cat_scghqk "Feeling worthless"
label var cat_scghql "General happiness lower"

// Store the analysis file and reload it; -clear- is the documented option of -use-
compress
save modified_longfile, replace
use modified_longfile, clear

********************************************************************************
* Calculations for blog post
use modified_longfile, clear   // -clear- is the documented option of -use-
drop if wave<6 // keep waves 6-10 only: change is calculated over the last waves
*keep if weight_wave10_generalized!=.

* FE likelihood to have higher problems
* One fixed-effects logit per GHQ item; 2018 is the reference year. Each coefficient plot
* (odds ratios) is saved to disk and its file name collected for the combined figure.
eststo clear   // also clears the estimates stored in memory
local fe_graphs
foreach var of varlist scghqg scghqc scghqa scghql scghqb scghqi scghqd scghqh scghqe scghqk scghqj scghqf {
	eststo `var': xtlogit cat_`var' ib2018.approx_syear c.age##c.age [weight=last_weight_pers_constant], fe
	coefplot, keep(*.approx_syear) title("`:var label cat_`var''") base xline(1, lstyle(dot)) mlabel format(%2.1g) mlabpos(1) saving(cat_`var', replace) plotregion(lwidth(none)) eform
	local fe_graphs `fe_graphs' cat_`var'.gph
}

// combine the saved plots in the order in which they were estimated
graph combine `fe_graphs', imargin(vsmall) scale(.95) col(2) ysize(7)


* Descriptive: share of the population with each problem, by approximate survey year
foreach ltr in a b c d e f g h i j k l {
	local item scghq`ltr'
	local ylab : variable label cat_`item'
	graph bar cat_`item' [weight=xsect_weight], ///
		over(approx_syear, label(alt)) blabel(total, format(%4.2f)) ///
		ytitle("`ylab'") plotregion(lwidth(none)) saving(cat_`item'_desc, replace)
}

// assemble the saved bar charts in presentation order
local desc_panels
foreach ltr in g c a l b i d h e k j f {
	local desc_panels `desc_panels' cat_scghq`ltr'_desc.gph
}
graph combine `desc_panels', imargin(small) scale(1) col(3) ysize(7) title("Share of population saying (much) more:")


* overall decline in life satisfaction, note: weight_wave10_generalized makes pretty much no difference
use modified_longfile, clear   // -clear- is the documented option of -use-

// FE regression of well-being on a quadratic in age and year dummies (reference year 2018)
eststo: xtreg life_sat100 c.age##c.age ib2018.approx_syear [weight=last_weight_pers_constant], vce(robust) fe

// predicted well-being for each survey year, posted so that coefplot can draw it
margins, at(approx_syear=(2010 2011 2012 2013 2014 2015 2016 2017 2018 2020)) post

// The scale description is now the y-axis title. It used to be a first title() option,
// which was overridden by the second title() and therefore never shown.
coefplot, mlabel format(%5.3g) mlabpos(1) vertical ylabel(,angle(0)) ///
	ytitle("Well-being on scale from 0-100") plotregion(lwidth(none)) ///
	xlabel(1 "2010" 2 "2011" 3 "2012" 4 "2013" 5 "2014" 6 "2015" 7 "2016" 8 "2017" 9 "2018" 10 "04.2020") ///
	saving(long_term_general_change.gph, replace) ///
	title("Long term well being change and Covid-19 pandemic" "(net of general age-related changes)") ///
	recast(connect)


* Benchmark: compare effects to the effect of being unemployed and receiving unemployment-related benefits
capture drop unempl_benefits

// receiving unemployment-related benefits, or national insurance credits? (values below 0 left out)
generate unempl_benefits = btype1
replace unempl_benefits = . if btype1 < 0

// do not count as unemployed if in paid employment
replace unempl_benefits = 0 if employ==1 & unempl_benefits!=.

xtreg life_sat100 unempl_benefits, fe vce(robust)

* Different effects within the pandemic

// effect of currently having symptoms that could be corona virus
regress life_sat100 i.hassymp if hassymp>0, vce(robust)

// effect of testresult (presumably the Covid-19 test result - confirm against the codebook)
regress life_sat100 i.testresult if testresult>0, vce(robust)



* different subgroups
use modified_longfile, clear   // -clear- is the documented option of -use-

* generate variables for different subgroups
generate general_population=1

* Monthly earnings 04.2020
capture drop monthly_earnings_042020
generate monthly_earnings_042020= netpay_answer if netpay_answer>=0
replace monthly_earnings_042020=. if netpay_period <0 |netpay_period==5 // missing if no earning period specified or top-coded
replace monthly_earnings_042020=monthly_earnings_042020/7*30  if netpay_period ==1 // change weekly to monthly earnings
replace monthly_earnings_042020=monthly_earnings_042020/14*30  if netpay_period ==2 // change pay every two weeks to monthly earnings
replace monthly_earnings_042020=monthly_earnings_042020/12  if netpay_period ==4 // change pay every year to monthly earnings
sum monthly_earnings_042020 [weight=betaindin_xw], d // sum up average wage 2020

// Split at 1450. The second range starts at 1450 (not 1451) so that non-integer earnings
// between 1450 and 1451 are recoded too; recode applies the first matching rule, so exactly
// 1450 still goes to the lower group. Previously such values kept their raw amount.
recode monthly_earnings_042020 (0/1450=0 "poorer 50%") (1450/max=1 "richer 50 percent"), gen(upper_50_percent_2020_temp)


* Monthly earnings 01.2020
* The pay-period conditions now test the pay PERIOD; they used to test blpay_answer (the
* amount), which blanked amounts equal to 5 and rescaled amounts equal to 1, 2 or 4.
* NOTE(review): assumes the January pay-period variable is called blpay_period, parallel
* to netpay_period above - confirm against the codebook.
capture drop monthly_earnings_012020
generate monthly_earnings_012020= blpay_answer if blpay_answer>=0
replace monthly_earnings_012020=. if blpay_period <0 | blpay_period==5 // missing if no earning period specified or top-coded
replace monthly_earnings_012020=monthly_earnings_012020/7*30  if blpay_period ==1 // change weekly to monthly earnings
replace monthly_earnings_012020=monthly_earnings_012020/14*30  if blpay_period ==2 // change pay every two weeks to monthly earnings
replace monthly_earnings_012020=monthly_earnings_012020/12  if blpay_period ==4 // change pay every year to monthly earnings

generate earnings_lost_gained_04_since_01=monthly_earnings_042020-monthly_earnings_012020
label var earnings_lost_gained_04_since_01 "Earnings lost/gained from 01.2020 to 04.2020"

// 1 = lost at least 100 per month, 0 = everything else. The second range starts at -100 so
// that changes between -100 and 1 are recoded to 0 instead of keeping their raw value.
recode earnings_lost_gained_04_since_01 (min/-100=1) (-100/max=0), gen(lost_inc_since_jan)
bysort pid: egen lost_inc_since_jan_time_const=max(lost_inc_since_jan) // those who lost income since january


// person-constant earnings group, taken from the 04.2020 earnings
bysort pid: egen upper_50_percent=max(upper_50_percent_2020_temp)
recode upper_50_percent (1=0) (0=1), generate(lower_50_percent)

// person-constant Covid risk indicators
bysort pid: egen high_covid_risk =max(hrisk_dv)
bysort pid: egen vhigh_covid_risk =max(vhrisk_dv)

// self-employment (not made person-constant)
generate self_employed=0
replace self_employed=1 if jbsemp==2 // variable before 2020
replace self_employed=1 if semp==2 // 2020-variable

bysort pid: egen living_with_partner=max(couple)

// age groups; recode applies the first matching rule, so age 70 is not "old" and age 30 is "young"
recode age (0/70=0) (70/max=1), gen(old)
recode age (0/30=1) (30/max=0), gen(young)

recode sex (2=1) (1=0) (else=.), generate(women)

// child1 recoded to 0/1 and copied to every row of the person
recode child1 (2=0) (1=1) (else=.), gen(school_age_child2020_temp)
bysort pid: egen with_school_age_children =max(school_age_child2020_temp)

* Well-being trend per subgroup: one FE regression and one coefficient plot for each group indicator
eststo clear
estimates clear
drop if approx_syear<2014
foreach grp of varlist general_population self_employed living_with_partner old young high_covid_risk lower_50_percent women with_school_age_children lost_inc_since_jan_time_const {

	// same model as for the full sample, restricted to members of the group
	eststo: xtreg life_sat100 c.age##c.age ib2018.approx_syear ///
		[weight=last_weight_pers_constant] if `grp'==1, vce(robust) fe
	*margins, at(approx_syear=(2015 2016 2017 2018 2020)) post

	// plot the year coefficients (constant and age terms dropped) and save the graph under the group name
	coefplot, drop(_cons *age*) mlabel format(%5.2g) mlabpos(1) vertical base ///
		ylabel(,angle(0)) title(Well-being on scale from 0-100) plotregion(lwidth(none)) ///
		saving(`grp',replace) title(`grp') xlabel(,alternate) ylabel(-10(2)8) ///
		yline(0, lstyle(dot)) recast(connect)
}

// one figure with the selected subgroup plots
graph combine ///
	general_population.gph young.gph self_employed.gph women.gph  with_school_age_children.gph ///
	lower_50_percent.gph living_with_partner.gph old.gph high_covid_risk.gph  , ///
	imargin(small) scale(.75) ///
	title("Well-being change and Covid-19 pandemic for subgroups" "(net of general age-related changes)") ///
	ysize(7)


* Different subgroups, different indicators
drop if wave<6 // only the last couple of waves are used

* FE likelihood to have higher problems, estimated separately for every group x GHQ item
eststo clear
estimates clear
foreach grp of varlist self_employed living_with_partner old young high_covid_risk lower_50_percent women with_school_age_children {
	foreach item of varlist scghqa scghqb scghqc scghqd scghqe scghqf scghqg scghqh scghqi scghqj scghqk scghql {
		preserve

		// restrict to group members with a non-missing answer on this item
		keep if `grp'==1 & `item'!=.

		// FE logit with odds ratios; stored under the item name and again as m<item>
		eststo `item': xtlogit cat_`item' ib2018.approx_syear c.age##c.age [weight=last_weight_pers_constant], fe or
		estimates store m`item'

		// year effects as odds ratios, saved as <group>_cat_<item>
		coefplot, keep(*.approx_syear) title("`:var label cat_`item''") base ///
			xline(1, lstyle(dot)) mlabel format(%2.1g) mlabpos(1) ///
			saving(`grp'_cat_`item', replace) plotregion(lwidth(none)) eform

		restore
	}
}


// One combined figure per subgroup, built from the item plots saved as <group>_cat_<item>.gph
foreach grp in self_employed living_with_partner old young high_covid_risk lower_50_percent women with_school_age_children {

	// collect the twelve item plots of this group in presentation order
	local panels
	foreach ltr in g c a l b i d h e k j f {
		local panels `panels' `grp'_cat_scghq`ltr'.gph
	}

	graph combine `panels', imargin(vsmall) scale(.95) col(2) ysize(7) saving(`grp'_group.gph, replace) title(`grp')
}

* overall life satisfaction for different groups