* Session setup: disable output paging and start from an empty dataset.
set more off
clear

* Close any log left open by an earlier run that aborted before reaching its
* own log close; otherwise the log using command below stops with a
* "log file already open" error. capture makes this a no-op when no log is open.
capture log close
log using read_ipums, text replace

/*

This do file constructs datasets of occupations' sizes and educational outcomes using data from the decennial census.
The output of these datases will then be used in aej_calculations.do  .

cps_00029 dataset was downloaded from IPUMS-CPS. It contains the CPS-ASEC, each
   year from 1962 to 2000. Within these datasets we download the following variables:

    Type	Variable	Label
H	YEAR	Survey year
H	SERIAL	Household serial number
H	MONTH	Month
H	CPSID	CPSID, household record
H	ASECFLAG	Flag for ASEC
P	PERNUM	Person number in sample unit
P	CPSIDP	CPSID, person record
P	AGE	Age
P	SEX	Sex
P	RACE	Race
P	MARST	Marital status
P	EMPSTAT	Employment status
P	LABFORCE	Labor force status
P	OCC1990	Occupation, 1990 basis
P	IND1990	Industry, 1990 basis
P	CLASSWKR	Class of worker
P	OCC90LY	Occupation last year, 1990 basis
P	CLASSWLY	Class of worker last year
P	WKSWORK2	Weeks worked last year, intervalled
P	WKSUNEM2	Weeks unemployed last year, intervalled
P	FULLPART	Worked full or part time last year
P	NUMEMPS	Number of employers last year
P	INCWAGE	Wage and salary income

and the following data flags:

    P	QAGE
P	QMARST
P	QSEX
P	QRACE
P	QCLASSWK
P	QEMPSTAT
P	QLABFORC
P	QOCC
P	QCLASSWL
P	QNUMEMPS
P	QWKSUNEM
P	QWKSWORK

usa_00034 is downloaded from ipums. Here are the samples, variables, and data quality flags we have selected.

Sample	Density	Note
1940 1%	1.0%	
1950 1%	1.0%	
1960 5%	5.0%	
1970 1% metro fm1	1.0%	
1970 1% metro fm2	1.0%	
1980 5% state	5.0%	
1990 5% state	5.0%	
2000 5%	5.0%	
2007 ACS 3yr	3.0%	
2013 ACS 3yr	3.0%	

H	YEAR	Census year	--
H	DATANUM	Data set number	--
H	SERIAL	Household serial number	--
H	HHWT	Household weight	--
H	STATEFIP	State (FIPS code)	--
H	COUNTYFIP	County (FIPS code)	--
H	METAREA (general)	Metropolitan area [general version]	--
H	METAREAD (detailed)	Metropolitan area [detailed version]	--
H	GQ	Group quarters status	--
P	PERNUM	Person number in sample unit	--
P	PERWT	Person weight	--
P	SEX	Sex	--
P	AGE	Age	details
P	MARST	Marital status	--
P	RACE (general)	Race [general version]	--
P	RACED (detailed)	Race [detailed version]	--
P	EDUC (general)	Educational attainment [general version]	--
P	EDUCD (detailed)	Educational attainment [detailed version]	--
P	EMPSTAT (general)	Employment status [general version]	--
P	EMPSTATD (detailed)	Employment status [detailed version]	--
P	LABFORCE	Labor force status	details
P	OCC	Occupation	--
P	OCC1950	Occupation, 1950 basis	--
P	OCC1990	Occupation, 1990 basis	--
P	OCC2010	Occupation, 2010 basis	--
P	IND	Industry	--
P	IND1990	Industry, 1990 basis	--
P	CLASSWKR (general)	Class of worker [general version]	--
P	CLASSWKRD (detailed)	Class of worker [detailed version]	--
P	OCCSOC	Occupation, SOC classification	--
P	WKSWORK1	Weeks worked last year	--
P	WKSWORK2	Weeks worked last year, intervalled	--
P	UHRSWORK	Usual hours worked per week	--
P	INCWAGE	Wage and salary income	--
P	INCEARN	Total personal earned income	--

DATA QUALITY FLAGS:11
 
Type	Variable
P	QAGE
P	QSEX
P	QRACE
P	QEDUC
P	QEMPSTAT
P	QIND
P	QOCC
P	QWKSWORK
P	QINCBUS
P	QINCFARM
P	QINCWAGE

*/

/* This "move rates" dataset that we construct here will be used later
   on in the construction of Table 18 in Appendix E of the paper.*/

 * ---------------------------------------------------------------------------
 * Build the occ1990 -> occsoc correspondence.
 * Input : usa_00034
 * Output: occ1990_occsoc_correspondence (variables occ1990 and occsoc)
 * Each occ1990 code is assigned the modal 4-character occsoc code observed
 * for it in years >= 2000; nine pairs are then appended by hand.
 * ---------------------------------------------------------------------------
 use occsoc year perwt occ* incwage wkswork2 age labf qocc qeduc classwkr qsex qage marst using usa_00034, replace

 * Truncate occsoc to its first four characters and convert it to a number;
 * with the force option, codes that are not purely numeric become missing.
 replace occsoc = substr(occsoc, 1, 4)
 destring occsoc, replace force

 * Modal occsoc within each occ1990 code, keeping only the mode computed on
 * the year >= 2000 observations and spreading it to every row of that code.
 * NOTE(review): mode() is called without a tie-breaking option - confirm how
 * occ1990 codes with tied modes should be treated.
 gen post2000 = (year >= 2000)
 bysort occ1990 post2000: egen soc_mode = mode(occsoc)
 replace soc_mode = . if post2000 == 0
 by occ1990: egen soc_mode_post = min(soc_mode)
 replace occsoc = soc_mode_post

 * One row per occ1990 code that received a non-missing occsoc.
 drop if occsoc == .
 bysort occ1990: keep if _n == 1
 keep occ1990 occsoc

 * Append nine hand-specified (occ1990, occsoc) pairs; the k-th entry of one
 * list is paired with the k-th entry of the other.
 * NOTE(review): nothing here checks whether these occ1990 codes are already
 * present above - confirm that no duplicate occ1990 rows result.
 local n_core = _N
 local extra_occ1990 67 168 235 326 684 726 834 876 999
 local extra_occsoc 1520 1930 1940 4341 5140 5141 5360 5371 0
 local n_total = `n_core' + 9
 set obs `n_total'
 forvalues k = 1/9 {
     local row = `n_core' + `k'
     local soc : word `k' of `extra_occsoc'
     local occ : word `k' of `extra_occ1990'
     replace occsoc = `soc' in `row'
     replace occ1990 = `occ' in `row'
 }

 * Recode two occsoc values, applied to every row of the table.
 replace occsoc = 1511 if occsoc == 1510
 replace occsoc = 5151 if occsoc == 5150

 save occ1990_occsoc_correspondence, replace

  * -------------------------------------------------------------------------
  * Occupation "move" rates by occsoc from the CPS extract.
  * Input : cps_00029, occ1990_occsoc_correspondence
  * Output: move_rates_by_occsoc (one row per occsoc, variable move)
  * -------------------------------------------------------------------------
  use cps_00029, replace

  * Sample: survey years 1970 onward, ages 16-64, wkswork2 >= 4, empstat == 10.
  keep if year>=1970
  keep if age>=16 & age<65 & wkswork2>=4 & empstat==10

  * Drop records whose data-quality flags fall in the listed ranges.
  drop if inrange(qage,1,3) | inrange(qrace,1,11) | inrange(qsex,1,11) | inlist(qocc,2,3) | qnumemps==1

  * move = 1 when numemps is 2 or 3, or when last year's occupation code
  * differs from the current one; 0 otherwise.
  gen move = inlist(numemps,2,3) | occ90ly!=occ1990

  * size = the occupation's share of the year's summed wtsupp.
  * NOTE(review): these totals use raw wtsupp while the collapse below uses
  * max(wtsupp,0); confirm whether negative weights occur and which is intended.
  bysort year: egen wt_year = total(wtsupp)
  bysort occ1990 year: egen wt_occ_year = total(wtsupp)
  gen size = wt_occ_year/wt_year

  * Weighted mean within (occ1990, year), then a simple mean across years.
  collapse (mean) move size [aw=max(wtsupp,0)], by(occ1990 year)
  collapse (mean) move size, by(occ1990)

  * Attach occsoc; rows that exist only in the correspondence file are dropped.
  * NOTE(review): 1:n is not the documented spelling of this merge type
  * (1:m is) - confirm and normalize.
  merge 1:n occ1990 using occ1990_occsoc_correspondence
  keep if _merge!=2

  * Size-weighted mean move rate per occsoc; occ1990 codes with no occsoc
  * match fall into the missing group and are removed.
  collapse (mean) move [aw=size], by(occsoc)
  drop if missing(occsoc)
  save move_rates_by_occsoc, replace

* ---------------------------------------------------------------------------
* Occupation employment shares for the New York and Boston metro areas.
* Input : usa_00034, occ1990_occsoc_correspondence
* Output: ipums_sizes_nybos - one row per (occsoc, year); occsize is the
*         occupation's share of that year's total, so it sums to 1 by year.
* ---------------------------------------------------------------------------
use usa_00034, clear

* Sample restrictions: years 1950-2000, wkswork2 >= 4, non-zero and
* non-missing wage income, ages 16-64, plus labour-force, class-of-worker,
* marital-status and data-quality-flag conditions.
keep if year<=2000 & year>=1950
keep if wkswork2>=4 & incwage~=0 & incwage~=.
keep if age>=16 & age<65 & labf==2 & qocc~=4 & qeduc~=4 & classwkr ==2 & qsex~=4 & qage~=4 & marst~=.

* Keep metarea 560 (flagged as New York) excluding five county codes, and
* metarea 112 (flagged as Boston).
* NOTE(review): the excluded county codes are presumably outlying counties of
* the New York metro area - confirm against the codebook.
gen byte metro_ny=(metarea==560 & county~=41 & county~=37 & county~=35 & county~=23 & county~=19)
gen byte metro_boston=(metarea==112)
keep if metro_ny==1 | metro_boston==1

* Weighted head count per (occ1990, year). pweights are needed here: with
* aweights, collapse (sum) rescales the weights to sum to the number of
* observations, so summing a constant 1 returns the unweighted cell count
* and perwt has no effect on the result.
gen occsize=1
collapse (sum) occsize [pw=perwt], by(occ1990 year)

* Map occ1990 to occsoc and aggregate to (occsoc, year).
joinby occ1990 using occ1990_occsoc_correspondence
collapse (sum) occsize, by(occsoc year)

* occsoc is already numeric in the correspondence file, so no destring is
* needed. occsoc 0 is the placeholder paired with occ1990 999 there.
drop if occsoc==0 | occsoc==.

* Convert counts to within-year shares.
bys year: egen s1=total(occsize)
replace occsize=occsize/s1
drop s1
save ipums_sizes_nybos, replace

* ---------------------------------------------------------------------------
* Education shares by occupation and year.
* Input : usa_00034, occ1990_occsoc_correspondence
* Output: ipums_occupation_education - one row per (occsoc, year) with ba and
*         ma, the perwt-weighted shares of workers with educ == 10 and
*         educ == 11 respectively.
* ---------------------------------------------------------------------------
* Load only the variables this section uses rather than the whole extract.
use year wkswork2 incwage age labforce qocc qeduc classwkr qsex qage marst perwt occ1990 educ using usa_00034, clear

* Sample restrictions, identical to the other census-based sections.
keep if year<=2000 & year>=1950
keep if wkswork2>=4 & incwage~=0 & incwage~=.
keep if age>=16 & age<65 & labforce==2 & qocc~=4 & qeduc~=4 & classwkr ==2 & qsex~=4 & qage~=4 & marst~=.

* Indicator variables for the two education codes of interest.
* NOTE(review): presumably educ 10 = four years of college and 11 = five or
* more - confirm against the codebook.
gen ba=(educ==10)
gen ma=(educ==11)
keep perwt occ1990 year ba ma

* Map occ1990 to occsoc, then take weighted means of the indicators.
joinby occ1990 using occ1990_occsoc_correspondence
collapse (mean) ba ma [aw=perwt], by(occsoc year)
save ipums_occupation_education, replace

* ---------------------------------------------------------------------------
* National occupation employment shares.
* Input : usa_00034, occ1990_occsoc_correspondence
* Output: ipums_sizes_ - one row per (occsoc, year); occsize is the
*         occupation's share of that year's total, so it sums to 1 by year.
* ---------------------------------------------------------------------------
use usa_00034, clear

* Sample restrictions, identical to the other census-based sections.
keep if year<=2000 & year>=1950
keep if wkswork2>=4 & incwage~=0 & incwage~=.
keep if age>=16 & age<65 & labf==2 & qocc~=4 & qeduc~=4 & classwkr ==2 & qsex~=4 & qage~=4 & marst~=.

* Weighted head count per (occ1990, year). pweights are needed here: with
* aweights, collapse (sum) rescales the weights to sum to the number of
* observations, so summing a constant 1 returns the unweighted cell count
* and perwt has no effect on the result.
gen occsize=1
collapse (sum) occsize [pw=perwt], by(occ1990 year)

* Map occ1990 to occsoc and aggregate to (occsoc, year).
joinby occ1990 using occ1990_occsoc_correspondence
collapse (sum) occsize, by(occsoc year)

* occsoc 0 is the placeholder paired with occ1990 999 in the correspondence.
drop if occsoc==0

* Convert counts to within-year shares.
bys year: egen s1=total(occsize)
replace occsize=occsize/s1
drop s1
save ipums_sizes_, replace

* ---------------------------------------------------------------------------
* Weighted worker counts by occupation, year, industry, education group, sex.
* Input : usa_00034, occ1990_occsoc_correspondence
* Output: ipums_sizes_by_ind_gender_educ - a full rectangle over
*         (sex, ind1990, eduG, occsoc, year); occsize is the weighted count,
*         set to 0 for combinations that do not occur in the data.
* ---------------------------------------------------------------------------
use ind* sex occsoc educd year perwt occ* incwage wkswork2 age labf qocc qeduc classwkr qsex qage marst using usa_00034, clear

* Sample restrictions; note this section starts in 1960 rather than 1950.
keep if wkswork2>=4 & incwage~=0 & incwage~=.
keep if age>=16 & age<65 & labf==2 & qocc~=4 & qeduc~=4 & classwkr ==2 & qsex~=4 & qage~=4 & marst~=.
keep if year>=1960 & year<=2000

* Education group from the detailed code educd. The ranges are mutually
* exclusive; every code not listed below stays in group 0.
*   1: educd == 60 or 62-64
*   2: educd 65-90
*   3: educd 100 or 101
*   4: educd >= 110 and non-missing
* NOTE(review): group 4 has no upper bound, so a large sentinel code (for
* example a 999 "missing" code, if the extract contains one) would be
* counted in the top group - confirm.
gen eduG=0
replace eduG=1 if educd==60 | (educd>=62 & educd<=64)
replace eduG=2 if educd>=65 & educd<=90
replace eduG=3 if educd==100 | educd==101
replace eduG=4 if educd>=110 & educd<.

* Weighted head count per cell. pweights are needed here: with aweights,
* collapse (sum) rescales the weights to sum to the number of observations,
* so summing a constant 1 returns the unweighted cell count and perwt has no
* effect on the result.
gen occsize=1
collapse (sum) occsize [pw=perwt], by(occ1990 year ind1990 eduG sex)

* Map occ1990 to occsoc and aggregate.
joinby occ1990 using occ1990_occsoc_correspondence
collapse (sum) occsize, by(occsoc year ind1990 eduG sex)

* Rectangularize: add every missing combination with a count of zero.
fillin sex ind1990 eduG occsoc year
replace occsize=0 if occsize==.
drop _fillin
save ipums_sizes_by_ind_gender_educ, replace

* Close the text log (read_ipums) opened at the top of this do-file.
log close