*-------------------------------------------------------------------------------
* CANAZEI WINTER SCHOOL 2026
*-------------------------------------------------------------------------------
* Laboratory on Using the Forbes Billionaires List as a Research Dataset
* Methods, Challenges, and Insights on the Global Super-Rich
* Lidia Ceriani: lidia.ceriani@univr.it, https://lidiaceriani.github.io/
*-------------------------------------------------------------------------------

version 17.0
clear all
set more off
cap log close

* Install the community-contributed -isocodes- command only when it is not
* already on the ado-path, so that re-running the script neither needs network
* access nor repeats the installation.
cap which isocodes
if _rc ssc install isocodes

*--- USER MUST EDIT THIS LINE ---
global project "/Users/lidia/Dropbox/Documents/Work/Teaching/Winter_School_Canazei/Analysis"   // or "C:/" on PC

* Folder layout, all relative to the project root
global data     "${project}/Data"
global data_new "${project}/Data_new"
global output   "${project}/Output"

* Create the derived-data and output folders when they do not exist yet.
* -capture- swallows the error that mkdir raises for an existing folder.
* The output folder must exist before the log file below can be opened.
cap mkdir "${data_new}"
cap mkdir "${output}"

log using "${output}/forbes_dataset.log", replace text


*-------------------------------------------------------------------------------
* 2004
* Source - Freund and Oliver (2016)
* https://www.piie.com/publications/working-papers/origins-superrich-billionaire-characteristics-database
*-------------------------------------------------------------------------------
use "${data}/Forbes_04.dta", clear

* Bring variable names in line with the other yearly files
*-------------------------------------------------------------------------------
rename (name citizenship networthusbillion sourceofwealth company IndustryAggre) ///
       (full_name country_of_citizenship net_worth source organization_name industry_aggregate)

* Replace the two value-labelled industry codes with their label text,
* keeping the original variable names
foreach v in industry industry_aggregate {
	decode `v', gen(`v'_label)
	drop `v'
	rename `v'_label `v'
}

* Build a simplified unique resource identifier (uri) from the full name, so
* that the same person can be matched across years and datasets
*-------------------------------------------------------------------------------
* 1. lower-case everything
gen 	uri = lower(full_name)
* 2. write " and " as " & " (the ampersand is then removed with the punctuation)
replace uri = subinstr(uri, " and ", " & ", .)
* 3. drop punctuation, letters are kept
replace uri = ustrregexra(uri, "[[:punct:]]", "")
* 4. decompose accented characters (NFD), then delete the combining marks
replace uri = ustrnormalize(uri, "nfd")
replace uri = ustrregexra(uri, "\p{Mark}", "")
* 5. drop the word "family", then a trailing " jr" or " sr"
* NOTE(review): the suffix pattern is anchored at the end of the string and is
* applied before trimming, so a suffix followed by blanks is not removed.
replace uri = regexr(subinstr(uri, "family", "", .), " jr$| sr$", "")
* 6. collapse repeated blanks, trim both ends, join the words with "-"
replace uri = subinstr(strtrim(stritrim(uri)), " ", "-", .)

* List names whose uri is not unique within a year, for visual inspection.
* Convention for manual fixes: append the city of residence if available,
* otherwise Jr or Sr.
*-------------------------------------------------------------------------------
duplicates tag uri year, gen(tag)
tabulate full_name if tag > 0
drop tag

* An age of zero is treated as missing
*-------------------------------------------------------------------------------
replace age = . if age == 0

* Provenance of these records
generate ref = "Freund_Oliver_2016"

keep year rank uri full_name country_of_citizenship net_worth age source organization_name  industry industry_aggregate ref

* Park the harmonized 2004 data in a temporary file for the later append
tempfile data_04
save `data_04'


*-------------------------------------------------------------------------------
* 2014, 2024
* Source - Kaggle
* https://www.kaggle.com/datasets/guillemservera/forbes-billionaires-1997-2023
*-------------------------------------------------------------------------------
use "${data}/Forbes_14_24", clear
keep year rank net_worth last_name first_name full_name birth_date age gender country_of* city_of* business_category organization_name position* self_made

* Harmonize variable names with the 2004 file
rename business_category    industry
rename net_worth 			networth
* NOTE(review): in the 2004 file -source- comes from sourceofwealth and
* -organization_name- from company; here the organization name is stored in
* -source-. Confirm that this mapping is intended.
rename organization_name	source

* Net worth is stored as text: strip the dollar sign and the "b" suffix, then
* convert to a number. Both internal and leading/trailing blanks are removed
* before the conversion. With -force-, any value that is still not numeric
* becomes missing without an error.
gen networth_str = lower(networth)
replace networth_str = subinstr(networth_str,"$","",.)
replace networth_str = subinstr(networth_str,"b","",.)
replace networth_str = strtrim(stritrim(networth_str))
destring networth_str, gen(net_worth) force
drop networth*

* Simplify names to get a workable unique resource identifier (uri) across years
* and datasets. The steps must stay identical to those used for the 2004 file,
* otherwise the same person would get different identifiers.
*-------------------------------------------------------------------------------
gen 	uri = strlower(full_name) //Convert everything to the same case
replace uri = subinstr(uri, " and ", " & ", .) //homogenize style
replace uri = ustrregexra(uri, "[[:punct:]]", "") //Remove punctuation (but keep letters)
replace uri = ustrnormalize(uri, "nfd") //Normalize accents (critical for international names)
replace uri = ustrregexra(uri, "\p{Mark}", "") //Find all accent marks and delete them
replace uri = subinstr(uri, "family", "", .) //Remove "family"
* NOTE(review): this end-anchored pattern runs before the trimming below, so a
* suffix that is followed by blanks is not removed.
replace uri = regexr(uri, " jr$| sr$", "") //remove suffix
replace uri = stritrim(uri) //Remove extra spaces in string
replace uri = strtrim(uri) //Remove extra spaces at the end and at the beginning of string
replace uri = subinstr(uri, " ", "-", .) //Join the words of the name with "-"

* Fix duplicate uri, by visual inspection
* As a convention, I add the city of residence, if available, or Jr, Sr
*-------------------------------------------------------------------------------
duplicates tag uri year, gen(tag)
tab full_name if tag!=0
drop tag

* These are duplicates
*-------------------------------------------------------------------------------
* H. Ross Perot, Jr.
* H. Ross Perot, Sr.
* Jim Davis
* Jim Davis & family
* Jin Lei
* Jin Lei & family
* Robert Miller
* Wang Yanqing & family

* Father and son: keep the suffix in the identifier
replace uri="henry-ross-perot-jr" if strpos(full_name, "Ross")>0 & strpos(full_name, "Perot")>0 & strpos(full_name, "Jr")>0
replace uri="henry-ross-perot-sr" if strpos(full_name, "Ross")>0 & strpos(full_name, "Perot")>0 & strpos(full_name, "Sr")>0

* Homonyms: told apart by city of residence
* NOTE(review): the condition matches every name containing "Davis" in these
* cities, not only "Jim Davis"; verify that no other person is affected.
replace uri="jim-davis-cockeysville" if strpos(full_name, "Davis")>0 & strpos(city_of_residence, "Cockeysville")>0
replace uri="jim-davis-newton" if strpos(full_name, "Davis")>0 & strpos(city_of_residence, "Newton")>0

* Homonyms: told apart by country of citizenship
replace uri = "robert-miller-ca" if strpos(full_name, "Robert")>0 & strpos(full_name, "Miller")>0 & country_of_citizenship=="Canada"
replace uri = "robert-miller-uk" if strpos(full_name, "Robert")>0 & strpos(full_name, "Miller")>0  & country_of_citizenship=="United Kingdom"

* CHECK THE OTHER ONES, AND FIX THE URI

* CITIZENSHIP
*-------------------------------------------------------------------------------
* Fall back on the country of residence when citizenship is empty, then add
* the ISO code with the community-contributed -isocodes- command
replace country_of_citizenship = country_of_residence if country_of_citizenship==""
isocodes country_of_citizenship, gen(iso3c)

* Provenance of these records, stored under the same variable name as in the
* 2004 file so that the appended data carry a single provenance column
gen ref="kaggle"
tempfile data_14_24
save `data_14_24'

* Stack the 2004 records and the 2014/2024 records
use `data_04'
append using `data_14_24'


* Use the information observed in some years to fill other years in which it
* is missing or unreliable
*-------------------------------------------------------------------------------

* Age
*-------------------------------------------------------------------------------
* For every person: the most recent year with a non-missing age, and the age
* recorded in that year
bysort uri: egen lastyear = max(cond(age < ., year, .))
bysort uri: egen lastage  = max(cond(year == lastyear, age, .))

* Age implied for each year by counting back from that reference year;
* when no reference age exists the originally reported age is kept
generate age_fill = cond(missing(lastage), age, lastage - (lastyear - year))

* Diagnostics: gap between reported and implied age, then a visual check
generate age_diff = age - age_fill
summarize age_diff, detail
browse birth_date age age_fill if age_fill != .


* Gender
*-------------------------------------------------------------------------------
* Spell out the one-letter codes
replace gender = "Female" if gender=="f"
replace gender = "Male" if gender=="m"

* Identify the last year with observed gender
bysort uri (year): egen gender_lastyear = max(cond(gender!="", year, .))

* Take the gender recorded in that year and spread it to all rows of the person.
* The rows are sorted on gender_anchor within uri before copying: the empty
* string sorts first, so the last row holds the non-empty anchor whenever one
* exists. Copying from the last row by year instead would erase the anchor for
* every person whose gender is missing in their most recent year.
gen gender_anchor = gender if year==gender_lastyear
bysort uri (gender_anchor): replace gender_anchor = gender_anchor[_N]

* Fill all years using the anchor (only when available)
gen gender_fill = gender
replace gender_fill = gender_anchor if gender_fill=="" & gender_anchor!=""

* Diagnostic

* IDs with no gender at all (this also restores the uri-year sort order)
bysort uri (year): egen has_gender = max(gender!="")
tab has_gender

* Check nothing real was overwritten
count if gender!="" & gender_fill!=gender


* Self-Made
*-------------------------------------------------------------------------------
* Identify the last year with observed self_made information
bysort uri (year): egen self_made_lastyear = max(cond(self_made!="", year, .))

* Take the self_made value recorded in that year and spread it to all rows of
* the person. The rows are sorted on self_made_anchor within uri before
* copying: the empty string sorts first, so the last row holds the non-empty
* anchor whenever one exists. Copying from the last row by year instead would
* erase the anchor for every person whose self_made value is missing in their
* most recent year.
gen self_made_anchor = self_made if year==self_made_lastyear
bysort uri (self_made_anchor): replace self_made_anchor = self_made_anchor[_N]

* Fill all years using the anchor (only when available)
gen self_made_fill = self_made
replace self_made_fill = self_made_anchor if self_made_fill=="" & self_made_anchor!=""

* Diagnostic

* IDs with no self_made at all (this also restores the uri-year sort order)
bysort uri (year): egen has_self_made = max(self_made!="")
tab has_self_made

* Check nothing real was overwritten
count if self_made!="" & self_made_fill!=self_made

* Keep the identifiers, the core variables and every original, helper and
* filled version of age, gender and self_made
keep year uri full_name rank net_worth age* gender* self_made* country_of_citizenship

* Shrink storage types, then save the working file
*-------------------------------------------------------------------------------
compress
save "${data_new}/temp_Forbes.dta", replace

* For each characteristic, export the list of people (one row per distinct
* uri and full_name pair) whose filled value is still missing. Age is numeric
* and is tested against the system missing value; the other two are strings
* and are tested against the empty string.
foreach item in age gender self_made {
	preserve
	if "`item'" == "age" {
		keep if age_fill == .
	}
	else {
		keep if `item'_fill == ""
	}
	keep uri full_name
	duplicates drop
	export delimited using "${data_new}/missing_`item'.csv", replace
	restore
}

log close


