*-------------------------------------------------------------------------------
* CANAZEI WINTER SCHOOL 2026
*-------------------------------------------------------------------------------
* Laboratory on Using the Forbes Billionaires List as a Research Dataset
* Methods, Challenges, and Insights on the Global Super-Rich
* Lidia Ceriani: lidia.ceriani@univr.it, https://lidiaceriani.github.io/
*-------------------------------------------------------------------------------

* Session setup: pin the Stata version, reset memory, and close any log left
* open by a previous (possibly interrupted) run.
version 17.0
clear all
set more off
cap log close

* Install the isocodes package only when it is not already available, so the
* script can be re-run (and run offline) without contacting SSC every time.
* If -which- cannot locate the command, the install is attempted as before.
cap which isocodes
if _rc ssc install isocodes

*--- USER MUST EDIT THIS LINE ---
global project "/Users/lidia/Dropbox/Documents/Work/Teaching/Winter_School_Canazei/Analysis"   // or "C:/" on PC

* Project sub-folders: raw data, derived data, and logs/output
global data     "${project}/Data"
global data_new "${project}/Data_new"
global output   "${project}/Output"

* Create the folders this script writes to. -capture- suppresses the error
* raised when a folder already exists. The Output folder must exist before
* the log file below can be opened.
cap mkdir "${data_new}"
cap mkdir "${output}"

log using "${output}/forbes_ai_cleaning.log", replace text

* Age
*-------------------------------------------------------------------------------
* Load the list of individuals and split it into small files that can be pasted
* into the AI prompt one at a time.
import delimited "${data_new}/missing_age.csv", clear

* Create chunk ID (groups of 25 consecutive rows)
gen chunk = ceil(_n/25)
tab chunk

* Export one CSV per chunk (fully automated).
* Files are written to ${data_new}, the same folder the AI responses are later
* read from, rather than to whatever the current working directory happens to be.
levelsof chunk, local(chunks)

foreach c of local chunks {
    preserve
        keep if chunk == `c'
        export delimited using "${data_new}/ai_input_chunk_`c'.csv", replace
    restore
}


/*
* PROMPT FOR CHATGPT (run once per chunk)
*-------------------------------------------------------------------------------

You are assisting an academic project studying extreme wealth.

Below is a list of individuals identified by a unique URI and full name.
For each individual, retrieve publicly available information on year of birth.

Use authoritative sources only:
- Forbes billionaire profiles (preferred)
- Official biographies
- Reputable major news outlets

IMPORTANT RULES:
- Do NOT guess or infer.
- If year of birth cannot be determined, return "unknown".
- Cite sources explicitly.
- Flag ambiguous or conflicting cases.
- For groups/families, keep the OLDEST individual only.

OUTPUT REQUIREMENTS:
- Output strictly as CSV (comma-separated)
- No commentary or markdown
- Use exactly the following columns and order:

uri,full_name,year_of_birth,sources,confidence,notes

If you are unsure or cannot find a reliable source, return "unknown".

BEGIN LIST
[PASTE CONTENTS OF ai_input_chunk_X.csv]
END LIST

Save the response in the Data_new folder as:
ai_output_chunk_X.csv
*/

*-------------------------------------------------------------------------------
* Combine the AI responses into a single dataset
*-------------------------------------------------------------------------------
* This part can only run after every ai_output_chunk_X.csv has been saved in
* ${data_new}. The number of chunks is recomputed from the input list so that
* this section also works when it is run on its own in a later session.
import delimited "${data_new}/missing_age.csv", clear
local n_chunks = ceil(_N/25)    // 25 must match the chunk size used for export

* Halt with an explicit message (instead of an "unrecognized command" error)
* while any expected AI response file is still missing.
local n_missing = 0
forvalues c = 1/`n_chunks' {
    capture confirm file "${data_new}/ai_output_chunk_`c'.csv"
    if _rc {
        display as error "Missing file: ${data_new}/ai_output_chunk_`c'.csv"
        local ++n_missing
    }
}
if `n_missing' > 0 {
    display as error "`n_missing' AI output file(s) not found. Run the prompt for each chunk, save the responses, then re-run."
    cap log close
    exit 601
}

* Read every response file into its own temporary dataset.
* All columns are read as strings: year_of_birth holds "unknown" in some chunks
* and only numbers in others, and -append- refuses to combine string with
* numeric variables. Convert to numeric downstream once "unknown" is handled.
* NOTE(review): varnames(1) assumes each response starts with the header row
* requested in the prompt -- confirm before relying on the row counts.
forvalues c = 1/`n_chunks' {
    import delimited "${data_new}/ai_output_chunk_`c'.csv", clear varnames(1) stringcols(_all)
    tempfile t`c'
    save `t`c''
}

* Stack the chunks in chunk order
clear
forvalues c = 1/`n_chunks' {
    append using `t`c''
}

* Save next to the other derived data rather than in the working directory
save "${data_new}/ai_birthyear_results_all.dta", replace

cap log close
