/* This do-file prepares the data for our analysis (makes final sample restrictions, merges in the Foreignness Indices, etc.)


We have 3 alternative versions of the Foreignness index:  
"original" - already in data - F-index based on cohort of birth.
"dumb" - to match the modern method, this version doesn't use an age-specific index
"mbpl" - counts second-gen (mother foreign born) as foreign */


* Start from an empty dataset; -cap- suppresses any error raised by -clear-.
cap clear
* Turn off the --more-- pause so the do-file runs without stopping.
set more off
* Directory globals (machine-specific absolute paths).
* Kbulk: in the code visible here it is referenced only in the commented-out "prior version" -use- lines.
global Kbulk "/disk/bulkw/hkissel/cens1930.work/"
* datamaps: holds the biblical and saints first-name lists merged in below.
global datamaps "/disk/homedirs/nber/hkissel/cens1930.work/CultAssim/AppendixTables/datamaps"
* writeup: not referenced in the code visible here.
global writeup "/disk/homedirs/nber/hkissel/cens1930.work/CultAssim/AppendixTables/tables"
* datadir: holds the foreignness-index files that are merged in and receives the saved analysis files.
global datadir "/disk/bulkw/hkissel/cens1930.work/Cultural_Assim_data"
* indir: holds the boys and girls input datasets.
global indir "/homes/data/cens1930.work/keriksso/keriksso/forHelen"



/*  ---------------------------------------------------------------------------------
							SET UP - boys
---------------------------------------------------------------------------------*/

* Boys input file (prior version: use "$Kbulk/Full1920_data_FBIndex_NEW.dta", clear)
use "$indir/Data_cultassim_boys.dta", clear

* Parental-nativity indicators: bothparents is a copy of native_foreignbo;
* oneparent is 1 when native_foreignmo or native_foreignfa equals 1, else 0.
generate bothparents = native_foreignbo
generate oneparent = (native_foreignmo == 1) | (native_foreignfa == 1)

* For father (hh) and mother (mom): child's birth year minus the parent's
* immigration year (presumably years in the US at the child's birth).
* Values outside 0-40 are set to missing.
local immig_hh  yrimmig_dad
local immig_mom yrimmig_mom
foreach who in hh mom {
	local yrs `who'_years_in_us_birth
	capture drop `yrs'
	generate `yrs' = birthyear - `immig_`who''
	summarize `yrs'
	replace `yrs' = . if !inrange(`yrs', 0, 40)
}

*** Sample restrictions
* NOTE(review): Stata treats missing as larger than any number, so the two
* birth-year restrictions below do not remove rows with a missing birth year.

* Children born 1902 or later, i.e. at most 18 when age is measured as 1920 - birthyear.
drop if birthyear < 1902

* Keep only race code 100. The original comment called this "drop blacks";
* every other race code (and missing race) is dropped as well.
drop if race != 100

* Rebuild age from birth year, replacing the age variable that came with the data.
drop age
generate age = 1920 - birthyear
summarize age

* Mother's age restriction: mothers born 1878 or later.
drop if mother_birthyear < 1878

* Family size restriction: drop every serial whose highest birth_order_all exceeds 10
* (serials where that maximum is missing are dropped too).
bysort serial: egen maxx = max(birth_order_all)
drop if maxx > 10

* Drop birth orders coded 0 (missing birth orders are kept).
keep if birth_order_son != 0
keep if birth_order_all != 0

**** Original foreignness index (FB) - cohort-specific
/* Note: there is already an f-index in the data from Katherine (f_index_20y_r_m_1940), but this uses the 1940 data */
* Remove any frequency columns already in memory so the merge can bring in its own.
cap drop freqUS
cap drop freqFB
* The index files are keyed on firstname, so copy name_given under that name.
gen firstname = name_given
* keep(1 3): keep unmatched master rows and matches; names found only in the index file are not added.
merge m:1 firstname age using "$datadir/US1920_FBIndex", keep(1 3) keepusing(FBindex freqUS freqFB)
rename FBindex FB
drop _merge


*gen REL_INDEX = FB/(1-FB)

	* Frequency of each name as a fraction of all names (from older version; no longer used in paper).
	* totalfreq adds freqall over one tagged row per distinct name_given.
	* NOTE(review): freqUS/freqFB were merged on firstname AND age, but only one row per
	* name_given is tagged, so totalfreq depends on which age cell happens to be tagged.
	* Confirm whether the tag should be by name and age instead.
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	* total() is the current name of the older egen sum(); both treat missing as 0.
	egen totalfreq = total(x)
	gen freq_frac = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq

* Merge in "dumb" index (keyed on firstname only, not age-specific)
cap drop FBindex
merge m:1 firstname using "$datadir/US1920_FBIndex_dumb", keep(1 3) keepusing(FBindex freqUS freqFB)
drop _merge
rename FBindex FB_dumb
* Ratio form of the index; evaluates to missing when FB_dumb is 1 or missing.
gen REL_INDEX_dumb = FB_dumb/(1-FB_dumb)


	* Frequency of each name as a fraction of all names
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	egen totalfreq = total(x)
	gen freq_frac_dumb = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq


* Merge in "dumb" "parents only" index
* Only FBindex is needed from this file, so the frequency columns are not read in.
cap drop FBindex
merge m:1 firstname using "$datadir/US1920_FBIndex_dumb_parentsonly", keep(1 3) keepusing(FBindex)
drop _merge
rename FBindex FB_dumb_parentsonly


* Merge in "mbpl" index (keyed on firstname and birthyear)
* The path is written with a single slash; the earlier doubled slash was harmless but easy to misread as a comment marker.
merge m:1 firstname birthyear using "$datadir/US1920_FBIndex_mbpl", keep(1 3) keepusing(FBindex freqUS freqFB)
rename FBindex FB_mbpl
gen REL_INDEX_mbpl = FB_mbpl/(1-FB_mbpl)
drop _merge

	* Frequency of each name as a fraction of all names
	* NOTE(review): as with the cohort-specific index, the frequencies here vary by birthyear
	* within a name while only one row per name_given is tagged -- confirm this is intended.
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	egen totalfreq = total(x)
	gen freq_frac_mbpl = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq

* Birth-year band indicators (0/1). The first band has no lower bound in the code,
* and the last band covers six years (1915-1920).
generate by_1900_1904 = (birthyear <= 1904)
generate by_1905_1909 = inrange(birthyear, 1905, 1909)
generate by_1910_1914 = inrange(birthyear, 1910, 1914)
generate by_1915_1920 = inrange(birthyear, 1915, 1920)

* Age band indicators (0/1); the last band covers four ages (12-15).
* Ages above 15 get 0 in every band.
generate three_0_2   = (age <= 2)
generate three_3_5   = inrange(age, 3, 5)
generate three_6_8   = inrange(age, 6, 8)
generate three_9_11  = inrange(age, 9, 11)
generate three_12_15 = inrange(age, 12, 15)

* fbpl with its last two digits removed.
* NOTE(review): presumably the general birthplace code; the variable name says "head" -- confirm.
generate birthplace_head = floor(fbpl/100)

sort name_given

* Flag first names that appear in the biblical-names list. keep(master match)
* discards names that exist only in the list, so no rows are added to the sample.
merge m:1 name_given using "$datamaps/biblical_firstnames.dta", keep(master match)
generate has_biblical = (_merge == 3)
drop _merge

* Same for the saints-names list.
merge m:1 name_given using "$datamaps/saints_firstnames.dta", keep(master match)
generate has_saint = (_merge == 3)
drop _merge

* Write the boys analysis file.
save "$datadir/CultAssim_data_foranalysis.dta", replace





/*  ---------------------------------------------------------------------------------
							SET UP - girls
---------------------------------------------------------------------------------*/

* Girls input file (prior version: use "$Kbulk/Full1920_data_FBIndex_women_NEW.dta", clear)
use "$indir/Data_cultassim_girls.dta", clear

* NOTE(review): the boys section also generates bothparents and oneparent at this point;
* they are not generated here -- confirm that is intended.

* For father (hh) and mother (mom): child's birth year minus the parent's
* immigration year (presumably years in the US at the child's birth).
* Values outside 0-40 are set to missing.
local immig_hh  yrimmig_dad
local immig_mom yrimmig_mom
foreach who in hh mom {
	local yrs `who'_years_in_us_birth
	capture drop `yrs'
	generate `yrs' = birthyear - `immig_`who''
	summarize `yrs'
	replace `yrs' = . if !inrange(`yrs', 0, 40)
}

*** Sample restrictions
* NOTE(review): Stata treats missing as larger than any number, so the two
* birth-year restrictions below do not remove rows with a missing birth year.

* Children born 1902 or later, i.e. at most 18 when age is measured as 1920 - birthyear.
drop if birthyear < 1902

* Keep only race code 100. The original comment called this "drop blacks";
* every other race code (and missing race) is dropped as well.
drop if race != 100

* Rebuild age from birth year, replacing the age variable that came with the data.
drop age
generate age = 1920 - birthyear

* Mother's age restriction: mothers born 1878 or later.
drop if mother_birthyear < 1878

* Family size restriction: drop every serial whose highest birth_order_all exceeds 10
* (serials where that maximum is missing are dropped too).
bysort serial: egen maxx = max(birth_order_all)
drop if maxx > 10

* Drop birth orders coded 0 (missing birth orders are kept).
keep if birth_order_dau != 0
keep if birth_order_all != 0

**** Original foreignness index (FB) - cohort-specific
/* Note: there is already an f-index in the data from Katherine (f_index_20y_r_f_1940), but this uses the 1940 data */
* Remove any frequency columns already in memory so the merge can bring in its own.
cap drop freqUS
cap drop freqFB
* The index files are keyed on firstname, so copy name_given under that name.
gen firstname = name_given
* keep(1 3): keep unmatched master rows and matches; names found only in the index file are not added.
merge m:1 firstname age using "$datadir/US1920_FBIndex_girls", keep(1 3) keepusing(FBindex freqUS freqFB)
rename FBindex FB
drop _merge

*gen REL_INDEX = FB/(1-FB)

	* Frequency of each name as a fraction of all names (in this dataset).
	* totalfreq adds freqall over one tagged row per distinct name_given.
	* NOTE(review): freqUS/freqFB were merged on firstname AND age, but only one row per
	* name_given is tagged, so totalfreq depends on which age cell happens to be tagged.
	* Confirm whether the tag should be by name and age instead.
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	* total() is the current name of the older egen sum(); both treat missing as 0.
	egen totalfreq = total(x)
	gen freq_frac = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq


* Merge in "dumb" index (keyed on firstname only, not age-specific)
cap drop FBindex
merge m:1 firstname using "$datadir/US1920_FBIndex_dumb_girls", keep(1 3) keepusing(FBindex freqUS freqFB)
drop _merge
rename FBindex FB_dumb
* Ratio form of the index; evaluates to missing when FB_dumb is 1 or missing.
gen REL_INDEX_dumb = FB_dumb/(1-FB_dumb)


	* Frequency of each name as a fraction of all names
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	egen totalfreq = total(x)
	gen freq_frac_dumb = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq

* Merge in "dumb" "parents only" index
* Only FBindex is needed from this file, so the frequency columns are not read in.
cap drop FBindex
merge m:1 firstname using "$datadir/US1920_FBIndex_dumb_parentsonly_girls", keep(1 3) keepusing(FBindex)
drop _merge
rename FBindex FB_dumb_parentsonly

* Merge in "mbpl" index (keyed on firstname and birthyear)
* The path is written with a single slash; the earlier doubled slash was harmless but easy to misread as a comment marker.
merge m:1 firstname birthyear using "$datadir/US1920_FBIndex_mbpl_girls", keep(1 3) keepusing(FBindex freqUS freqFB)
rename FBindex FB_mbpl
gen REL_INDEX_mbpl = FB_mbpl/(1-FB_mbpl)
drop _merge

	* Frequency of each name as a fraction of all names
	* NOTE(review): as with the cohort-specific index, the frequencies here vary by birthyear
	* within a name while only one row per name_given is tagged -- confirm this is intended.
	gen freqall = freqUS + freqFB
	egen t = tag(name_given)
	gen x = freqall if t==1
	egen totalfreq = total(x)
	gen freq_frac_mbpl = freqall/totalfreq
	drop freqUS freqFB freqall t x totalfreq


* Birth-year band indicators (0/1). The first band has no lower bound in the code,
* and the last band covers six years (1915-1920).
* NOTE(review): the boys section also builds three-year age bands (three_0_2 ... three_12_15);
* none are built here -- confirm that is intended.
generate by_1900_1904 = (birthyear <= 1904)
generate by_1905_1909 = inrange(birthyear, 1905, 1909)
generate by_1910_1914 = inrange(birthyear, 1910, 1914)
generate by_1915_1920 = inrange(birthyear, 1915, 1920)

* fbpl with its last two digits removed.
* NOTE(review): presumably the general birthplace code; the variable name says "head" -- confirm.
generate birthplace_head = floor(fbpl/100)

sort name_given

* Flag first names that appear in the biblical-names list. keep(master match)
* discards names that exist only in the list, so no rows are added to the sample.
merge m:1 name_given using "$datamaps/biblical_firstnames.dta", keep(master match)
generate has_biblical = (_merge == 3)
drop _merge

* Same for the saints-names list.
merge m:1 name_given using "$datamaps/saints_firstnames.dta", keep(master match)
generate has_saint = (_merge == 3)
drop _merge

* Write the girls analysis file.
save "$datadir/CultAssim_data_foranalysis_girls.dta", replace





* Stack the boys analysis file (saved earlier in this do-file) under the girls data
* that is still in memory, giving one combined file.
* Variables generated only in the boys section (bothparents, oneparent and the
* three-year age bands) will be missing on the girls rows unless the girls input
* file already contains variables with those names.
append using $datadir/CultAssim_data_foranalysis.dta



* Create a family identifier from the serial number and the mom's location in the household.
* s and p are string copies of serial_orig and momloc, SS joins them with an underscore,
* and family is one integer id per distinct value of SS.
* A missing momloc becomes the string "." and therefore still receives a family id.
* The helper variables s, p and SS are not dropped, so they are written to the saved file.


tostring serial_orig, gen(s)
tostring momloc, gen(p)

gen SS = s + "_" + p
egen family = group(SS)

*** now keep only the family groupings that all have the same mom
* NOTE(review): no such restriction is applied in the code visible here before the save below.

* Write the combined boys-and-girls analysis file.
save $datadir/CultAssim_data_foranalysis_all.dta, replace

