************** CODE for the LAB in Winter School ************
* This do-file mirrors the structure of the lecture slides.
* We start from a "perfect data" benchmark in which father outcomes are observed,
* and then relax it in two steps: imperfect linking and limited overlap.

* --- Session setup ------------------------------------------------------------
* Start from an empty session, switch off output paging, and raise the
* variable limit before the data file is opened.
clear all
set more off
set maxvar 16000



********************************************************************************
* Data source
*
* The data used in this lab come from IPUMS USA.
* They are based on historical U.S. Census microdata
* and can be downloaded (after free registration) from https://usa.ipums.org/usa/linked_data_samples_downloads.shtml
*
* The linked father–son structure is constructed using
* IPUMS-provided household and individual identifiers.
********************************************************************************

* Project folder. This path is machine-specific: it is defined once here and
* reused for both the working directory and the data file, so running the lab
* on another computer only requires editing this single line.
global Data "C:\Users\Andrea\Documents\Madrid\UC3M\TA\Lecture -WSIT"
cd "$Data"

* -clear- is the documented option of -use- for replacing the data in memory
* (-replace- belongs to -save-). A forward slash is used as the separator
* because the backslash doubles as Stata's macro escape character.
use "$Data/linked_1880-1900_males (1).dta", clear

********************************************************************************
* Keep only variables needed for the lecture:
* - IDs to identify households and linked individuals across years
* - Names/surnames (used to construct surname groups)
* - Demographics (age/sex/race) for sample restrictions
* - Geography (state/county/SEA) for conditioning exercises
* - Occupational outcomes + weights
********************************************************************************
* The variable list is assembled from thematic groups so that each group can be
* read (and edited) on its own; -keep- does not depend on the order of the list.
local v_ids    serial_1 pernum_1 serial_2 pernum_2 hhseq year_1 year_2 linktype
local v_weight perwt
local v_names  namelast_1 namefrst_1 namelast_2 namefrst_2
local v_demo   age_1 age_2 sex_1 sex_2 race_1 race_2 raced_1 raced_2
* relate_1 is used below to define the household head and the child sample.
local v_relate relate_1 relate_2 related_1 related_2 imprel_2
* NOTE(review): poploc_* is grouped with geography by assumption -- confirm.
local v_geo    stateicp_1 stateicp_2 county_1 county_2 sea_1 sea_2 poploc_1 poploc_2
local v_occ    occ1950_1 occ1950_2 occscore_1 occscore_2 occ_1 occ_2

keep `v_ids' `v_weight' `v_names' `v_demo' `v_relate' `v_geo' `v_occ'
	 
	 
********************************************************************************
*** Impute the father's outcome to the household
*** (Perfect linking + full overlap benchmark)
***
*** Goal: for each child, create a single "father occupational score" observed in year_1.
*** In the slides, this is the clean benchmark where father outcomes are measured well.
********************************************************************************

* Step 1: within each household in year_1 (serial_1), collect the father's occscore_1.
* Father definition here: relate_1==1 and sex_1==1 (head/male as father proxy).


// -----------------------------------------------------------------------------
// (a) Flag the father proxy: a male household head in year_1.
// -----------------------------------------------------------------------------
capture drop h_household
generate byte h_household = (sex_1 == 1 & relate_1 == 1)

// Diagnostic only: how many flagged heads does each year_1 household contain
// before any cleaning? The counter is discarded right after being tabulated.
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop check

// -----------------------------------------------------------------------------
// (b) Remove repeated father records.
// A father can show up on several rows of the linked file. Within the flagged
// heads, exactly one row per (serial_1, pernum_1) is tagged; the untagged
// father rows are dropped. Rows that are not flagged as fathers are never
// touched by this step (the drop is restricted to h_household==1).
// -----------------------------------------------------------------------------
capture drop first_rec
egen first_rec = tag(serial_1 pernum_1) if h_household == 1
drop if h_household == 1 & first_rec == 0
drop first_rec

// -----------------------------------------------------------------------------
// (c) Re-count fathers per household and discard households that still hold
// more than one, so that every remaining household has at most one father.
// -----------------------------------------------------------------------------
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop if check > 1
drop check

// -----------------------------------------------------------------------------
// (d) Attach the father's occupational score to every member of his household.
// head_score is non-missing only on the father's own row; with at most one
// father per household, the within-household mean simply copies that value to
// all rows sharing the same serial_1.
// -----------------------------------------------------------------------------
capture drop head_score occscore_f
generate double head_score = cond(h_household == 1, occscore_1, .)
egen occscore_f = mean(head_score), by(serial_1)
drop head_score

********************************************************************************
*** Generate Variables of Interest
*** Outcomes are logged occupational scores (child and father)
********************************************************************************

* Log occupational scores: the child's is measured in year_2, the father's is
* the household-level value built above. Non-positive or missing scores yield
* a missing log.
generate ln_occscore   = log(occscore_2)
generate ln_occscore_f = log(occscore_f)



********************************************************************************
*** Impose sample restriction (Santavirta–Stuhler & Olivetti–Paserman)
*** + Full overlap: keep only observations with observed fathers
********************************************************************************

* A record enters the analysis sample when, in year_1, it is under 16, has
* raced_1==100 and relate_1==3, and a father log score is available.
* NOTE(review): raced_1==100 / relate_1==3 are presumably the IPUMS codes for
* "white" and "child of head" -- confirm against the codebook.
generate sample = (relate_1 == 3 & raced_1 == 100 & age_1 < 16 & ln_occscore_f != .)

keep if sample == 1

********************************************************************************
*** Generate surname identifier
*** This is the object that will later play the role of the instrument
********************************************************************************

* Numeric surname identifier built from the year_1 surname. Records with an
* empty surname are left without an identifier (missing).
egen surname = group(namelast_1) if namelast_1 != ""

********************************************************************************
*** Surname group average
*** This is the surname mean of father outcomes
********************************************************************************

* Unweighted mean of the father's log score across all sample records sharing
* the surname; records without a surname identifier get a missing mean.
egen mean_surname = mean(ln_occscore_f) if !missing(surname), by(surname)

********************************************************************************
* Variable labels (for tables and teaching clarity)
********************************************************************************

* Outcome and regressors (these labels are what -esttab, label- prints).
label variable ln_occscore   "Child log occupational score"
label variable ln_occscore_f "Father OccScore"
label variable mean_surname  "Surname Average OccScore"

* Bookkeeping variables.
label variable surname       "Surname identifier"
label variable sample        "Analysis sample indicator"

********************************************************************************
********************* COMPARISON OF ESTIMATORS *********************************
*** Benchmark comparison: Parent–child vs surname-based
********************************************************************************

* DIRECT REGRESSION (parent–child persistence)
* Both models are weighted by perwt (analytic weights) with robust standard
* errors, and each is stored under its own name for the comparison table.

* Column 1 -- direct regression: child on own father.
eststo direct: regress ln_occscore ln_occscore_f if sample==1 [aweight=perwt], vce(robust)

* Column 2 -- grouping regression: child on the surname mean of father outcomes.
eststo grouping: regress ln_occscore mean_surname if sample==1 [aweight=perwt], vce(robust)

* Side-by-side table: coefficients and standard errors with three decimals.
esttab direct grouping, replace ///
    title("Parent-Child vs Surname-Based Persistence") ///
    mtitles("Parent-Child" "Surname-based") ///
    keep(ln_occscore_f mean_surname) ///
    order(ln_occscore_f mean_surname) ///
    b(%9.3f) se(%9.3f) ///
    star(* 0.10 ** 0.05 *** 0.01) ///
    stats(N r2, labels("Observations" "R-squared")) ///
    label ///
    alignment(D{.}{.}{-1})

/*
********************************************************************************
********************************* "2SLS vs OLS" *********************************
*** Interpretation slide: surname regression as IV
********************************************************************************

* OLS already shown above
* IV: surname dummies as instruments for the father's log occupational score
*xi: ivreg2 ln_occscore (ln_occscore_f = i.surname) if sample==1 [aw=perwt], rob
*eststo group_iv

*********************************************************************************
*/




	
********************************************************************************
* Imperfect Linking
*
* In the benchmark section, we could assign each child a father's occscore_1
* observed in the same household (clean "ground truth").
*
* Here we relax that assumption: some father-child links are missing or imperfect.
* This is the realistic case that motivates surname-based/group-based approaches.
********************************************************************************
* Restart from an empty session so this section can be run on its own.
clear all
set more off
set maxvar 16000

* Project folder (machine-specific). It is defined once and reused for the
* working directory and the data file.
global Data "C:\Users\Andrea\Documents\Madrid\UC3M\TA\Lecture -WSIT"
cd "$Data"

* -clear- is the documented option of -use- for replacing the data in memory
* (-replace- belongs to -save-). A forward slash is used as the separator
* because the backslash doubles as Stata's macro escape character.
use "$Data/linked_1880-1900_males (1).dta", clear

********************************************************************************
* Keep only variables needed for the lecture:
* - IDs to identify households and linked individuals across years
* - Names/surnames (used to construct surname groups)
* - Demographics (age/sex/race) for sample restrictions
* - Geography (state/county/SEA) for conditioning exercises
* - Occupational outcomes + weights
********************************************************************************
* The variable list is assembled from thematic groups so that each group can be
* read (and edited) on its own; -keep- does not depend on the order of the list.
local v_ids    serial_1 pernum_1 serial_2 pernum_2 hhseq year_1 year_2 linktype
local v_weight perwt
local v_names  namelast_1 namefrst_1 namelast_2 namefrst_2
local v_demo   age_1 age_2 sex_1 sex_2 race_1 race_2 raced_1 raced_2
* relate_1 is used below to define the household head and the child sample.
local v_relate relate_1 relate_2 related_1 related_2 imprel_2
* NOTE(review): poploc_* is grouped with geography by assumption -- confirm.
local v_geo    stateicp_1 stateicp_2 county_1 county_2 sea_1 sea_2 poploc_1 poploc_2
local v_occ    occ1950_1 occ1950_2 occscore_1 occscore_2 occ_1 occ_2

keep `v_ids' `v_weight' `v_names' `v_demo' `v_relate' `v_geo' `v_occ'
	 
	 
	 
	 
********************************************************************************
*** Impute the father's outcome to the household
*** (Perfect linking + full overlap benchmark)
***
*** Goal: for each child, create a single "father occupational score" observed in year_1.
*** In the slides, this is the clean benchmark where father outcomes are measured well.
********************************************************************************

* Step 1: within each household in year_1 (serial_1), collect the father's occscore_1.
* Father definition here: relate_1==1 and sex_1==1 (head/male as father proxy).


// -----------------------------------------------------------------------------
// (a) Flag the father proxy: a male household head in year_1.
// -----------------------------------------------------------------------------
capture drop h_household
generate byte h_household = (sex_1 == 1 & relate_1 == 1)

// Diagnostic only: how many flagged heads does each year_1 household contain
// before any cleaning? The counter is discarded right after being tabulated.
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop check

// -----------------------------------------------------------------------------
// (b) Remove repeated father records.
// A father can show up on several rows of the linked file. Within the flagged
// heads, exactly one row per (serial_1, pernum_1) is tagged; the untagged
// father rows are dropped. Rows that are not flagged as fathers are never
// touched by this step (the drop is restricted to h_household==1).
// -----------------------------------------------------------------------------
capture drop first_rec
egen first_rec = tag(serial_1 pernum_1) if h_household == 1
drop if h_household == 1 & first_rec == 0
drop first_rec

// -----------------------------------------------------------------------------
// (c) Re-count fathers per household and discard households that still hold
// more than one, so that every remaining household has at most one father.
// -----------------------------------------------------------------------------
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop if check > 1
drop check

// -----------------------------------------------------------------------------
// (d) Attach the father's occupational score to every member of his household.
// head_score is non-missing only on the father's own row; with at most one
// father per household, the within-household mean simply copies that value to
// all rows sharing the same serial_1.
// -----------------------------------------------------------------------------
capture drop head_score occscore_f
generate double head_score = cond(h_household == 1, occscore_1, .)
egen occscore_f = mean(head_score), by(serial_1)
drop head_score

********************************************************************************
*** Generate Variables of Interest
*** Outcomes are logged occupational scores (child and father)
********************************************************************************

* Log occupational scores: the child's is measured in year_2, the father's is
* the household-level value built above. Non-positive or missing scores yield
* a missing log.
generate ln_occscore   = log(occscore_2)
generate ln_occscore_f = log(occscore_f)



********************************************************************************
*** Impose sample restriction (Santavirta–Stuhler & Olivetti–Paserman)
*** + Full overlap: keep only observations with observed fathers
********************************************************************************

* A record enters the analysis sample when, in year_1, it is under 16, has
* raced_1==100 and relate_1==3, and a father log score is available.
* NOTE(review): raced_1==100 / relate_1==3 are presumably the IPUMS codes for
* "white" and "child of head" -- confirm against the codebook.
generate sample = (relate_1 == 3 & raced_1 == 100 & age_1 < 16 & ln_occscore_f != .)

keep if sample == 1



********************************************************************************
* 2) Imperfect linking proxy: build surname averages using surnames from the other side
*    (here: collapse by namelast_1, then merge on namelast_2)
********************************************************************************

preserve

    // Key difference with respect to the benchmark section: instead of giving
    // each child his own observed father's outcome, father outcomes are
    // aggregated at the SURNAME level. This mimics the case where individual
    // father links are missing or unreliable.

    // An empty surname is not a surname group. Without this filter, all
    // blank-surname records would be pooled into one "" group and that pooled
    // mean would then be merged onto every child whose year_2 surname is
    // blank. The benchmark section likewise treats blank surnames as missing.
    drop if namelast_1 == ""

    // One row per year_1 surname, holding the (unweighted) mean father log score.
    collapse (mean) mean_surname_partial = ln_occscore_f, by(namelast_1)
    drop if missing(mean_surname_partial)

    // The surname file is keyed on the year_1 surname, but children are matched
    // on the surname they report in year_2, so the key is renamed to the
    // merge variable.
    rename namelast_1 namelast_2

    // Resulting "surname file": one average father outcome per surname.
    tempfile surname
    save `surname', replace
restore


// Attach the surname-level average to each child through his year_2 surname
// and keep only children whose surname is present in the surname file
// (equivalent to dropping _merge!=3 and then dropping _merge).
merge m:1 namelast_2 using `surname', keep(match) nogenerate

********************************************************************************
* 3) Regressions + table
********************************************************************************

* Labels shown in the table (used by -esttab, label-).
label variable ln_occscore          "Child log occupational score"
label variable ln_occscore_f        "Father OccScore"
label variable mean_surname_partial "Surname average"

* Both models are weighted by perwt (analytic weights) with robust standard
* errors, and are estimated on the children that survived the surname merge.

* Column 1 -- child on own father.
eststo direct3: regress ln_occscore ln_occscore_f if sample==1 [aweight=perwt], vce(robust)

* Column 2 -- child on the surname average matched through the year_2 surname.
eststo grouping3: regress ln_occscore mean_surname_partial if sample==1 [aweight=perwt], vce(robust)

esttab direct3 grouping3, ///
    title("Intergenerational Persistence: Imperfect Linking") ///
    mtitles("Parent--Child" "Surname (imperfect linking)") ///
    keep(ln_occscore_f mean_surname_partial) ///
    order(ln_occscore_f mean_surname_partial) ///
    b(%9.3f) se(%9.3f) ///
    star(* 0.10 ** 0.05 *** 0.01) ///
    stats(N r2, labels("Observations" "R-squared")) ///
    label ///
    alignment(D{.}{.}{-1})
	
********************************************************************************
* Limited Overlap
*
* Compute surname-level means using an independent 5% subsample,
* then merge this information back to the main sample.
*
* This illustrates the case where parental information comes from
* a different (partially overlapping) population.
********************************************************************************

* Restart from an empty session so this section can be run on its own.
clear all
set more off
set maxvar 16000

* Project folder (machine-specific). It is defined once and reused for the
* working directory and the data file.
global Data "C:\Users\Andrea\Documents\Madrid\UC3M\TA\Lecture -WSIT"
cd "$Data"

* -clear- is the documented option of -use- for replacing the data in memory
* (-replace- belongs to -save-). A forward slash is used as the separator
* because the backslash doubles as Stata's macro escape character.
use "$Data/linked_1880-1900_males (1).dta", clear

********************************************************************************
* Keep only variables needed for the lecture:
* - IDs to identify households and linked individuals across years
* - Names/surnames (used to construct surname groups)
* - Demographics (age/sex/race) for sample restrictions
* - Geography (state/county/SEA) for conditioning exercises
* - Occupational outcomes + weights
********************************************************************************
* The variable list is assembled from thematic groups so that each group can be
* read (and edited) on its own; -keep- does not depend on the order of the list.
local v_ids    serial_1 pernum_1 serial_2 pernum_2 hhseq year_1 year_2 linktype
local v_weight perwt
local v_names  namelast_1 namefrst_1 namelast_2 namefrst_2
local v_demo   age_1 age_2 sex_1 sex_2 race_1 race_2 raced_1 raced_2
* relate_1 is used below to define the household head and the child sample.
local v_relate relate_1 relate_2 related_1 related_2 imprel_2
* NOTE(review): poploc_* is grouped with geography by assumption -- confirm.
local v_geo    stateicp_1 stateicp_2 county_1 county_2 sea_1 sea_2 poploc_1 poploc_2
local v_occ    occ1950_1 occ1950_2 occscore_1 occscore_2 occ_1 occ_2

keep `v_ids' `v_weight' `v_names' `v_demo' `v_relate' `v_geo' `v_occ'
	 
	 
	 
	 
********************************************************************************
*** Impute the father's outcome to the household
*** (Perfect linking + full overlap benchmark)
***
*** Goal: for each child, create a single "father occupational score" observed in year_1.
*** In the slides, this is the clean benchmark where father outcomes are measured well.
********************************************************************************

* Step 1: within each household in year_1 (serial_1), collect the father's occscore_1.
* Father definition here: relate_1==1 and sex_1==1 (head/male as father proxy).


// -----------------------------------------------------------------------------
// (a) Flag the father proxy: a male household head in year_1.
// -----------------------------------------------------------------------------
capture drop h_household
generate byte h_household = (sex_1 == 1 & relate_1 == 1)

// Diagnostic only: how many flagged heads does each year_1 household contain
// before any cleaning? The counter is discarded right after being tabulated.
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop check

// -----------------------------------------------------------------------------
// (b) Remove repeated father records.
// A father can show up on several rows of the linked file. Within the flagged
// heads, exactly one row per (serial_1, pernum_1) is tagged; the untagged
// father rows are dropped. Rows that are not flagged as fathers are never
// touched by this step (the drop is restricted to h_household==1).
// -----------------------------------------------------------------------------
capture drop first_rec
egen first_rec = tag(serial_1 pernum_1) if h_household == 1
drop if h_household == 1 & first_rec == 0
drop first_rec

// -----------------------------------------------------------------------------
// (c) Re-count fathers per household and discard households that still hold
// more than one, so that every remaining household has at most one father.
// -----------------------------------------------------------------------------
capture drop check
egen check = total(h_household), by(serial_1)
tabulate check
drop if check > 1
drop check

// -----------------------------------------------------------------------------
// (d) Attach the father's occupational score to every member of his household.
// head_score is non-missing only on the father's own row; with at most one
// father per household, the within-household mean simply copies that value to
// all rows sharing the same serial_1.
// -----------------------------------------------------------------------------
capture drop head_score occscore_f
generate double head_score = cond(h_household == 1, occscore_1, .)
egen occscore_f = mean(head_score), by(serial_1)
drop head_score

********************************************************************************
*** Generate Variables of Interest
*** Outcomes are logged occupational scores (child and father)
********************************************************************************

* Log occupational scores: child in year_2, father as assigned to the household.
gen ln_occscore   = ln(occscore_2)
gen ln_occscore_f = ln(occscore_f)

********************************************************************************
*** Impose sample restriction (Santavirta–Stuhler & Olivetti–Paserman)
*** + Full overlap: keep only observations with observed fathers
********************************************************************************

* The restriction is applied BEFORE the surname averages are built, exactly as
* in the benchmark section. If the averages were computed first, the
* "full overlap" surname mean would be taken over every record in the file
* (not just the analysis sample), while the limited-overlap mean further below
* is drawn from the restricted sample -- the two columns of the table would
* then differ in population as well as in overlap.
gen sample = (age_1<16 & raced_1==100 & relate_1==3 & ln_occscore_f!=.)

keep if sample==1

********************************************************************************
*** Generate surname identifier
*** This is the object that will later play the role of the instrument
********************************************************************************

* Records with an empty year_1 surname get a missing identifier.
egen surname = group(namelast_1)
replace surname = . if namelast_1==""

********************************************************************************
*** Surname group average
*** This is the surname mean of father outcomes
********************************************************************************

* Full-overlap surname mean, computed on the analysis sample; undefined for
* records without a surname identifier.
egen mean_surname = mean(ln_occscore_f), by(surname)
replace mean_surname = . if surname==.

preserve

    * Auxiliary sample for the surname means: a 5% random draw from the
    * restricted sample currently in memory. The drawn records are NOT removed
    * from the main sample (-restore- below brings the full sample back), so
    * the overlap is limited rather than zero: a child's own father contributes
    * to his surname mean only if that record happened to be drawn.
    *
    * Reproducibility: -sample- depends on the row order at the time of the
    * draw. Without -stable-, rows that tie on the sort key are ordered at
    * random, so the same seed could select different records across runs.
    * NOTE(review): -stable- preserves the incoming order of ties; if the key
    * below is not unique, consider also fixing -set sortseed- at the top of
    * the do-file.
    sort serial_1 pernum_1 serial_2 pernum_2, stable
    set seed 1234
    sample 5

    * Surname-level (unweighted) average of the father's log score, using only
    * the auxiliary 5% sample.
    collapse (mean) ln_occscore_f, by(namelast_1)
    drop if ln_occscore_f == .
    rename ln_occscore_f mean_overlap

    tempfile overlap5
    save `overlap5', replace
restore


* Merge limited-overlap surname means back into the main sample.
* Each child is assigned a surname-level proxy estimated
* from a different (largely non-overlapping) set of families.
* Attach the auxiliary-sample surname mean to every child with the same year_1
* surname. Only matched records are retained, so children whose surname never
* appears in the auxiliary sample leave the data at this point.
merge m:1 namelast_1 using `overlap5', keep(match) nogenerate


// Records without a surname identifier (surname is missing when namelast_1 is
// empty) would otherwise carry the mean of the pooled blank-surname group;
// the proxy is set to missing for them.
replace mean_overlap = . if missing(surname)

********************************************************************************
* REGRESSIONS + TABLE
********************************************************************************
* Labels shown in the table (used by -esttab, label-).
label variable ln_occscore_f "Father Occscore"
label variable mean_surname  "Surname Average"
label variable mean_overlap  "Surname average (limited overlap)"

* All three models are weighted by perwt (analytic weights) with robust
* standard errors, and are estimated on the children kept by the merge above.

* Column 1 -- child on own father.
eststo direct2: regress ln_occscore ln_occscore_f if sample==1 [aweight=perwt], vce(robust)

* Column 2 -- child on the surname mean stored in mean_surname.
eststo grouping2: regress ln_occscore mean_surname if sample==1 [aweight=perwt], vce(robust)

* Column 3 -- child on the surname mean estimated from the 5% auxiliary draw.
eststo group_overlap: regress ln_occscore mean_overlap if sample==1 [aweight=perwt], vce(robust)

esttab direct2 grouping2 group_overlap, ///
    title("Intergenerational Persistence: Benchmark and Overlap") ///
    mtitles("Parent-Child" "Surname (Full overlap)" "Surname (Limited overlap)") ///
    keep(ln_occscore_f mean_surname mean_overlap) ///
    order(ln_occscore_f mean_surname mean_overlap) ///
    b(%9.3f) se(%9.3f) ///
    star(* 0.10 ** 0.05 *** 0.01) ///
    stats(N r2, labels("Observations" "R-squared")) ///
    label ///
    alignment(D{.}{.}{-1})
	
	

********************************************************************************
// If you want you can replicate two core extensions from the lecture
********************************************************************************

*** 1) Weight patterns by surname frequency

*** 2) Condition on geography

********************************************************************************



