/*

Multiple-Product Firms and Product Switching
This Stata .do file inputs raw census files and outputs datasets used to generate the results in the paper.
See the accompanying .do file 20060523_create_results.do for the code used to generate the results.
v2009.2.4

This program has the following sections/steps
1 Create firm-ssic5-year dataset from the cmf_base## files
2 Create a full firm x product x year dataset for all firm-prod combinations in the actual data
3 Create version of the above full dataset that excludes ssic5 that do not show up in every year of sample
4 Compute index TFP measures based on firms' major industry (msic) by sample:
	c8797: only includes ssic5 that are common to 1987, 1992 and 1997 censuses
	c7282: only includes ssic5 that are common to 1972, 1977 and 1982 censuses
	c7297: only includes ssic5 that are common to 1972 - 1997 censuses 
5 Create product mix dummies by sample
6 Create firm's major industry code

Note on variable names 
	lrd names 
	ppn     = plant identifier
      curpc   = product code from trailer file
	firmid  = firm id 
	        = alpha+0000 for multi-unit firms 
	        = 0+ein      for single unit firms
	tvs     = total value of shipments $000 (I think) 
	va      = value added 
	tae     = total assets ending (k) -- $000 (I think) 
	mr      = machinery and equip rental 
	br      = building rental 
	cm      = total cost of materials 
	numprods= number of products 
	exp     = value of export shipments  
	te      = total employment 
	pw      = production workers 
	oe      = other workers 
	sw      = total salaries and wages -- $000 (I think) 
	ww      = production worker salaries and wages -- $000 (I think) 
	ow      = other (non-prod) workers wages -- $000 (I think) 
	xxx     = region variable used for deflating wages by regional cpi 
	censt   = census state; 11-96; first digit is a census region; ca=93 
	fipst   = state code; alphabetical; 1-56 

Census Data
As noted in the paper, our analysis relies on data gathered and maintained by the U.S. Census Bureau. These data are not available to the general public and can only be accessed by permission of the Census Bureau. Guidelines for gaining such permission are available on the Center for Economic Studies (CES) website, www.ces.census.gov. 

Other Data Files
1. bbg96_87.dta is available on the nber website at http://www.nber.org/nberces/nbprod96.htm
2. region_state_cpi.dta (CPI of plant's BLS region; see www.bls.gov)
3. cmf_base`y'.dta are the census of manufactures base files for census year y from the census server (converted from SAS to stata). note that into the base files we have merged the raw product trailer files by ppn and year
4. cmf`yr'prod.dta are the census of manufactures product trailer files for census year yr from the census server (converted from SAS to stata).

If you spot any problems with this code, please email peter.schott@yale.edu

*/ 


**0 Preliminaries
*  Start from an empty dataset, switch off the -more- pause so the run does not
*  wait for a keypress, and request 2700m of data memory.
*  NOTE(review): -set mem- is presumably only needed on older Stata releases - confirm for the version in use.
clear
set more off
set mem 2700m


**1 Create firm-ssic5-year dataset from the cmf_base## files
capture log close
log using ps4_basic_part1, replace text

*first prep the deflators in the nber database and from the regional cpis to be base year 1997
*  Reads : bbg96_87.dta (sic year piship piinv pimat)
*  Writes: ps4_bbg_deflators    (same data keyed by pt_ind year)
*          ps4_bbg_deflators_pv (same data keyed by sic4 year)
*  New variables: piship97 pimat97 (deflator / its 1997 value within sic)
*                 piinv97          (deflator / its 1996 value within sic)
use bbg96_87.dta, clear
keep sic year piship piinv pimat

*shipments and materials deflators: divide by the 1997 value of the same sic
foreach xxx in piship pimat {
	gen t1 = year==1997
	gen t2 = t1*`xxx'
	*zeros mark the non-base years; blank them so the mean picks up only the base-year value
	replace t2=. if t2==0
	egen t3 = mean(t2), by(sic)
	gen `xxx'97 = `xxx'/t3
	drop t1-t3
}

*investment deflator: the 1997 value is overwritten with the preceding observation
*of the SAME sic.  The lag is taken by sic so that a 1997 row that happens to be the
*first row of its industry gets missing instead of another industry's value.
sort sic year
by sic: gen lag_piinv = piinv[_n-1]
replace piinv = lag_piinv if year==1997

*investment deflator is rebased on 1996 (the 1997 row now carries the lagged value)
foreach xxx in piinv {
	gen t1 = year==1996
	gen t2 = t1*`xxx'
	replace t2=. if t2==0
	egen t3 = mean(t2), by(sic)
	gen `xxx'97 = `xxx'/t3
	drop t1-t3
}

*version keyed on the plant's industry code
rename sic pt_ind
sort pt_ind year
save ps4_bbg_deflators, replace

*identical version keyed on the four-digit root of the product code
rename pt_ind sic4
sort sic4 year
save ps4_bbg_deflators_pv, replace

*regional cpi: express rcpi relative to its year-97 value within region (times 100)
*  Reads : region_state_cpi
*  Writes: ps4_region_state_cpi_base97, sorted by censt year
use region_state_cpi, clear
rename state censt

*year is still two-digit here, so 97 is the base year
gen is97     = (year==97)
gen cpi_in97 = is97*rcpi
egen cpi_base = max(cpi_in97), by(region)
gen rcpi97   = rcpi/cpi_base*100
drop is97 cpi_in97 cpi_base

*move to four-digit years to match the census files
replace year = year+1900
sort censt year
save ps4_region_state_cpi_base97, replace

*check total value in base file, by year, deflate and create firm- and plantchars files
*  For each census year: one row per plant with nominal and deflated characteristics
*  (saved as ps4_plantchars + year), then summed to one row per firm (ps4_firmchars + year).
foreach y of numlist 72(5)97 {
	use cmf_base`y', clear

	*frmnu and censt to numeric; with force, non-numeric entries become missing
	rename frmnu frmnu_s
	rename censt censt_s
	destring frmnu_s, force generate(frmnu)
	destring censt_s, force generate(censt)
	drop frmnu_s censt_s

	*one row per plant-firm-year
	collapse (max) ar (mean) tvs numprods te pw oe sw ww ow tae mr br cm exp censt frmnu pt_ind, by(ppn firmid year)

	*deflate wages
	sort censt year
	merge censt year using ps4_region_state_cpi_base97, keep(rcpi97)
	tab _merge
	keep if _merge!=2
	drop _merge
	foreach w in sw ow ww {
		gen r`w' = `w'/(rcpi97/100)
	}

	*deflate other inputs
	sort pt_ind year
	merge pt_ind year using ps4_bbg_deflators, keep(piship97 piinv97 pimat97)
	tab _merge 
	keep if _merge!=2
	drop _merge
	gen rtvs = tvs/piship97
	gen rcm  = cm/pimat97
	gen rtae = tae/piinv97

	*log which industries found no deflator and how much shipment value they carry
	tab pt_ind if piship97==.
	gen i = (piship97!=.)
	table i, c(sum tvs) f(%20.0fc)

	table year, c(sum tvs count tvs sum rtvs count rtvs) f(%12.0fc)

	save ps4_plantchars`y', replace

	*add plants up to the firm
	collapse (max) ar (sum) tvs numprods te pw oe sw ww ow tae mr br cm exp rsw row rww rtvs rcm rtae, by(firmid year)

	table year, c(sum tvs count tvs sum rtvs count rtvs) f(%12.0fc)
	sort firmid year
	save ps4_firmchars`y', replace
}

*put together firm- and plantchar files
*  stack the six census-year files into ps4_firmchars and ps4_plantchars

*firm level, sorted by firmid year
use ps4_firmchars72, clear
foreach y of numlist 77(5)97 {
	append using ps4_firmchars`y'
}
sort firmid year
save ps4_firmchars, replace

*plant level, sorted by ppn year
use ps4_plantchars72, clear
foreach y of numlist 77(5)97 {
	append using ps4_plantchars`y'
}
sort ppn year
save ps4_plantchars, replace

*build the product-level files, one census year at a time
*  Reads : cmf_base + year, ps4_bbg_deflators_pv
*  Writes: ps4_plantlevel + year (one row per firmid ppn ssic5 year, with deflated rpt_pv)
*          fp + year             (one row per firmid ssic5 year)
forvalues y=72(5)97 {
	use cmf_base`y', clear
	keep firmid ppn year tvs curpc pt_pv ar frmnu censt
	gen str1 prefix1 = substr(curpc,1,1)
	gen str5 ssic5 = substr(curpc,1,5)

	**drop obs based on ar, balance codes, etc
	*  rules are applied in sequence, so obsdrop holds the LAST rule an obs fails
	*  the variable is spelled out as prefix1 in both tests so the line does not
	*  depend on variable-name abbreviation being switched on
	gen obsdrop = 0
	replace obsdrop = 1 if ar==1
	replace obsdrop = 2 if ~(prefix1=="2" | prefix1=="3")
	replace obsdrop = 3 if pt_pv==.
	replace obsdrop = 4 if pt_pv==0

	*check out how many obs get dropped
	table obsdrop year, c(count ar)
	keep if obsdrop==0
	tab prefix1

	*frmnu and censt to numeric; with force, non-numeric entries become missing
	rename frmnu t1
	destring t1, force g(frmnu)
	rename censt t2
	destring t2, force g(censt)
	drop t1 t2

	*collapse to ppn-ssic5-year 
	collapse (sum) pt_pv (mean) tvs ar frmnu censt, by(firmid ppn ssic5 year)

	*deflate for plant file
	*  the shipments deflator is looked up by the four-digit root of the product code
	gen ssic4 = substr(ssic5,1,4)
	destring ssic4, force g(sic4)
	sort sic4 year
	merge sic4 year using ps4_bbg_deflators_pv, keep(piship97)
	tab _merge
	drop if _merge==2
	drop _merge
	gen rpt_pv = pt_pv/piship97
	compare rpt_pv pt_pv
	drop piship97 ssic4 sic4
	save ps4_plantlevel`y', replace  

	*collapse to firmid-ssic5-year 
	collapse (sum) pt_pv ar, by(firmid ssic5 year)

	save fp`y', replace
} 


*stack the firm-product files of all census years and attach firm characteristics
*  Writes: fp_01, sorted by firmid ssic5 year
use fp72, clear
foreach y of numlist 77(5)97 {
	append using fp`y'
}
sort firmid year
merge firmid year using ps4_firmchars
tab _merge
*firm-years that exist only in the characteristics file are not kept
keep if _merge!=2
drop _merge

*deflate product shipment values
*  deflator is matched on the four-digit root of ssic5
gen root4 = substr(ssic5,1,4)
destring root4, force generate(sic4)
sort sic4 year
merge sic4 year using ps4_bbg_deflators_pv, keep(piship97)
tab _merge
keep if _merge!=2
drop _merge
gen rpt_pv = pt_pv/piship97
compare rpt_pv pt_pv
drop piship97 root4 sic4

sort firmid ssic5 year
save fp_01, replace

**check how much is left in sample now
*  firm-year tvs (divided by 1000000) summed and counted by year, for the log only
use firmid tvs year using fp_01, clear
collapse (mean) tvs, by(firmid year)
gen v = tvs/1000000
table year, c(sum v count v)

*stack the plant-product files of all census years
use ps4_plantlevel72, clear
foreach y of numlist 77(5)97 {
	append using ps4_plantlevel`y'
}
sort ppn ssic5 year
save ps4_plantlevel, replace

*create pre-winnowed birthdeath for firmage purposes
*  one row per firm-year with positive, non-missing tvs
*  born / died = first / last census year in which the firm appears
use fp_01, clear
collapse (mean) tvs, by(firmid year)
keep if !(tvs==0 | tvs==.)
egen born = min(year), by(firmid)
egen died = max(year), by(firmid)
gen birthyear = (born==year)

*t1 carries the year on the birth row only; t2 spreads it to every row of the firm
gen t1 = year if birthyear==1
egen t2 = mean(t1), by(firmid)

*years since birth, shifted up by 5 so logs can be taken below
gen age = year-t2+5
sort firmid year
save ps4_firmage, replace


log close



**2 Create a full firm x product x year dataset for all firm-prod combinations in the 
**  actual data. Then merge in the actual data, with missing values for obs that are not
**  actually in the dataset.
capture log close
log using ps4_basic_part2, text replace

*pre file for years
*  six rows: 1972 1977 1982 1987 1992 1997
clear
set obs 6
gen year = 1967 + 5*_n
sort year
save year, replace

*every firm-product pair that ever appears, crossed with every census year
use fp_01, clear
bysort firmid ssic5: keep if _n==1
keep firmid ssic5
cross using year
sort firmid ssic5 year

*bring the observed data back in; rows with no match keep missing values
merge firmid ssic5 year using fp_01
tab _merge
sort firmid ssic5 year
rename _merge dataflag
keep if inrange(year,1972,1997)
label var dataflag "_3=original data; _1=from fillin"

*negative product values are counted in the log and then set to zero
gen neg = (pt_pv<0)
tab neg year
replace pt_pv=0 if pt_pv<0 
*drop neg

save fp5_02, replace

log close




**3. Create versions of fp5_02 that exclude ssic5 that do not show up in every year of sample
**   Call these samples _c8797, c7282 and c7297
**
**   First, create the lists of common products
**   Second, exclude extraneous ssic5 from samples 
**
**   Note that these common lists have dropped any five digit ending in 0 unless it was the only 5 digit
**   in the four digit industry
**

**3a.  create the common lists
capture log close 
log using yr_to_yr_curpc_02.log, replace 
**Create Unique List of curpc5 for each year 
**  Reads : cmf + yr + prod (product trailer file)
**  Writes: curpc5_ + yr, one row per distinct five-digit code, sorted by curpc5
foreach yr in 72 77 82 87 92 97 { 
	use cmf`yr'prod, clear 
	keep curpc 

	*keep only codes whose first digit is 2 or 3.  The test is done on the string,
	*as in section 1, so a non-numeric leading character is simply dropped instead
	*of stopping the run with a type mismatch.
	keep if inlist(substr(curpc,1,1),"2","3") 

	gen curpc5=substr(curpc,1,5) 
	keep curpc5 

	*one row per five-digit code
	sort curpc5 
	by curpc5: keep if _n==1 
	save curpc5_`yr', replace 
	drop _all 
} 


*by year, look for codes that end in zero 
*  log-only check: i flags a five-digit code ending in 0 that is the only
*  five-digit code within its four-digit root
foreach yr of numlist 97(-5)72 { 
	use curpc5_`yr', clear 
	gen last0   = (substr(curpc5,5,1)=="0") 
	gen root4   = substr(curpc5,1,4) 
	egen n_code = count(last0), by(root4) 
	egen n_zero = total(last0), by(root4) 
	gen i       = (n_code==1 & n_zero==1) 
	tab i     

} 

**create common list for all years 
**  keep a five-digit code only if it is present in every census 1972-1997
use curpc5_72, clear 
sort curpc5 
foreach yr of numlist 77(5)97 { 
	merge curpc5 using curpc5_`yr' 
	tab _merge 
	drop if _merge!=3 
	drop _merge 
	sort curpc5 
	*write the number of surviving codes to the log, tagged with the year
	gen year=`yr' 
	tab year 
	drop year 
} 
*drop codes ending in 0 if it is not the only 5 in a 4 
sum  
gen last0   = (substr(curpc5,5,1)=="0") 
gen root4   = substr(curpc5,1,4) 
egen n_code = count(last0), by(root4) 
egen n_zero = total(last0), by(root4) 
gen i       = (n_code==1 & n_zero==1) 
tab last0 i     
keep if last0==0 | i==1 
tab last0 i 
drop last0 root4 n_code n_zero i 
save curpc5_c7297, replace 



**create common list for 72-82 
**  keep a five-digit code only if it is present in the 1972, 1977 and 1982 censuses
use curpc5_72, clear 
sort curpc5 
forvalues yr = 77(5)82 { 
	merge curpc5 using curpc5_`yr' 
	tab _merge 
	drop if _merge!=3 
	drop _merge 
	sort curpc5 
	*write the number of surviving codes to the log, tagged with the year
	gen year=`yr' 
	tab year 
	drop year 
}	 
*drop codes ending in 0 if it is not the only 5 in a 4 
sum  
gen last0   = (substr(curpc5,5,1)=="0") 
gen root4   = substr(curpc5,1,4) 
egen n_code = count(last0), by(root4) 
egen n_zero = total(last0), by(root4) 
gen i       = (n_code==1 & n_zero==1) 
tab last0 i     
keep if last0==0 | i==1 
tab last0 i 
drop last0 root4 n_code n_zero i 
save curpc5_c7282, replace 


**create common list for 87-97 
**  keep a five-digit code only if it is present in the 1987, 1992 and 1997 censuses
use curpc5_87, clear 
sort curpc5 
forvalues yr = 92(5)97 { 
	merge curpc5 using curpc5_`yr' 
	tab _merge 
	drop if _merge!=3 
	drop _merge 
	sort curpc5 
	*write the number of surviving codes to the log, tagged with the year
	gen year=`yr' 
	tab year 
	drop year 
}	 
*drop codes ending in 0 if it is not the only 5 in a 4 
sum  
gen last0   = (substr(curpc5,5,1)=="0") 
gen root4   = substr(curpc5,1,4) 
egen n_code = count(last0), by(root4) 
egen n_zero = total(last0), by(root4) 
gen i       = (n_code==1 & n_zero==1) 
tab last0 i     
keep if last0==0 | i==1 
tab last0 i 
drop last0 root4 n_code n_zero i 
save curpc5_c8797, replace 
log close 




**3b Create a basic data files that are specific to constant set of goods for each sample:
**	
**   _c8797
**   _c7282
**   _c7297
**
**
**   Also create files needed for tfp routine
**
**   Files written per sample (suffix _c + sample):
**     curpc5_..._01     common product list with the code renamed to ssic5
**     fp5_02_...        firm-product-year panel restricted to common products
**     ps4_plantlevel_...  plant-product-year data restricted to common products
**     ps4_pdeflator_...   plant-level deflators weighted by product value
**     ps4_02_ptfp1_...    plant characteristics with pt_ind renamed to msic
**     ps4_birthdeath_...  firm birth and death year rows
**     ps4_mplant_...      number of plants and multi-plant flag by firm-year
**
capture log close
log using ps4_drop_uncommon_ssic5.log, text replace
foreach xxx in 8797 7282 7297 {	


	*common product list, keyed on ssic5 so it can be merged below
	use curpc5_c`xxx', clear
	rename curpc5 ssic5
	sort ssic5
	save curpc5_c`xxx'_01, replace



	*firm-product-year panel: keep the sample's years and the common products
	use fp5_02, clear
	if `xxx'==7282 {
		drop if year>=1987
	}
	if `xxx'==8797 {
		drop if year<=1982
	}	
	sort ssic5
	merge ssic5 using curpc5_c`xxx'_01, keep(ssic5)
	table year _merge, c(sum pt_pv count pt_pv) f(%20.0fc)
	keep if _merge==3
	drop _merge
	
	*get rid of firms not from these years
	*added 2007.11.8
	*  t1 is the firm's total product value over the sample; it stays in the saved file
	egen t1=total(pt_pv), by(firmid)
	drop if t1==0
	
	save fp5_02_c`xxx', replace


	*plant-product-year data: same year and product restrictions
	use ps4_plantlevel, clear
	if `xxx'==7282 {
		drop if year>=1987
	}
	if `xxx'==8797 {
		drop if year<=1982
	}	
	sort ssic5
	merge ssic5 using curpc5_c`xxx'_01, keep(ssic5)
	table year _merge, c(sum pt_pv count pt_pv) f(%20.0fc)
	keep if _merge==3
	drop _merge
	save ps4_plantlevel_c`xxx', replace


	**create plant-level deflators
	*  four-digit deflators averaged within plant-year, weighted by product value
	use ps4_plantlevel_c`xxx', clear 
	gen ssic4 = substr(ssic5,1,4) 
	drop if pt_pv==0 | pt_pv==. | pt_pv<0
	collapse (sum) pt_pv, by(ppn ssic4 year) 
 
	destring ssic4, force g(sic4) 
	sort sic4 year 
	merge sic4 year using ps4_bbg_deflators_pv, keep(piship97 piinv97 pimat97) 
	tab _merge 
	drop if _merge==2 
	drop _merge
	collapse (mean) pi* [aw=pt_pv], by(ppn year)
	rename piship97 ppiship97
	rename piinv97 ppiinv97
	rename pimat97 ppimat97

	sort ppn year
	save ps4_pdeflator_c`xxx', replace


	*Create plant-level raw-data-tfp files to be used in the TFP routine below

	*First, use plants' major-industry to deflate
	*  note: this file is not restricted by year or product here
	use ps4_plantchars, clear
	rename pt_ind msic 
	*destring pt_ind, force g(msic)
	sort ppn year
	save ps4_02_ptfp1_c`xxx', replace

	*create birthdeath file
	use fp_01, clear
	if `xxx'==7282 {
		drop if year>=1987
	}
	if `xxx'==8797 {
		drop if year<=1982
	}	
	sort ssic5
	merge ssic5 using curpc5_c`xxx'_01, keep(ssic5)
	table year _merge, c(sum pt_pv count pt_pv) f(%20.0fc)
	keep if _merge==3
	drop _merge
	collapse (mean) tvs, by(firmid year)

	*keep only the first and last year in which the firm has positive tvs
	drop if tvs==0 | tvs==.
	egen born = min(year), by(firmid)
	egen died = max(year), by(firmid)
	gen birthyear = born==year
	gen deathyear = died==year
	keep if birthyear==1 | deathyear==1
	sort firmid year
	save ps4_birthdeath_c`xxx', replace	


	**create multi-plant indicator
	*  -clear- is the documented option of -use- for discarding the data in memory
	use ps4_plantlevel_c`xxx', clear

	*one row per plant, then count plants within firm-year
	collapse (sum) pt_pv, by(firmid ppn year)
	gen nplant = 1	
	collapse (sum) nplant, by(firmid year)
	gen mplant = nplant>1 & nplant~=.
	sort firmid year
	save ps4_mplant_c`xxx', replace	

}


log close




**4 Compute Index TFP based on msic BY SAMPLE
**
**  For each sample: load the plant file, take logs, apply screens, compute cost
**  shares and their msic-year means, build one set of reference means per census
**  year, form acrtfp_p1 = ta - tb - tc, and average it to the firm-year level
**  using int(tvs) as weights.
**  Writes: ps4_acrtfp_interim_c + sample, year-specific work files named
**          year_1_c + sample and year_2_c + sample, and the firm-level result
**          ps4_acrtfp1997_c + sample + _p1
clear
set more off
set mem 2700m
capture log close
log using ps4_basic_tfp.txt, replace text
*nyear is only used in the names of the output files
local nyear = 1997

foreach xxx in 8797 7282 7297 {	

	*set directory

	**create locals here because year vector changes by sample below
	*ylist1 = all years of the sample, ylist3 = first year, ylist2 = the remaining years
	if `xxx'==7297 {
		local ylist1 = "1972 1977 1982 1987 1992 1997"
		local ylist2 = "1977 1982 1987 1992 1997"
		local ylist3 = "1972"
	}
	if `xxx'==8797 {
		local ylist1 = "1987 1992 1997"
		local ylist2 = "1992 1997"
		local ylist3 = "1987"
	}
	if `xxx'==7282 {
		local ylist1 = "1972 1977 1982"
		local ylist2 = "1977 1982"
		local ylist3 = "1972"
	}


	***************************************************************************************
	*1. FIRST plant file 
	use ps4_02_ptfp1_c`xxx', clear
	*logs of inputs and output, each named l + variable
	foreach yyy in pw oe rcm rtvs rww row rtae {
		gen l`yyy' = ln(`yyy')
	}
	
	**screens
	*drop plants with a zero or an undefined log in output, either labor type, materials or assets
	drop if rtvs==0  | pw==0  | oe==0  | rcm==0  | rtae==0 
	drop if lrtvs==. | lpw==. | loe==. | lrcm==. | lrtae==. 
	
	*drop plants whose materials plus wage bill exceeds deflated shipments
	gen suminputs=rcm+rww+row 
	drop if suminputs>rtvs
	sum te pw oe rsw rww row rcm rtvs  
	
	**compute cost shares
	*each share is the input cost divided by deflated shipments
	foreach yyy in rww row rcm {
	      gen s_`yyy' = `yyy'/rtvs
	}
	*the asset share is the residual, so the four shares add to one
	gen s_rtae = 1-s_rcm-s_rww-s_row
	
	**compute mean costshares
	*creates ms_row ms_rww ms_rcm ms_rtae
	foreach yyy in s_row s_rww s_rcm s_rtae {
		egen m`yyy' = mean(`yyy'), by(msic year)
	}
	
	**compute mean i/o
	*creates mlrtvs mlpw mloe mlrtae mlrcm
	foreach yyy in lrtvs lpw loe lrtae lrcm {
		egen m`yyy' = mean(`yyy'), by(msic year)
	}
	
	**distribute 'nyear' mean shares and i/o to all years
	*the inputs are already constant within msic-year, so these means by msic year reproduce them
	foreach yyy in rww row rcm rtae {
		*Added year in by() b/c we will now have data for multiple years--not just 1977
		egen msyear_`yyy' = mean(ms_`yyy'), by(msic year) 
	}
	foreach yyy in lrtvs lpw loe lrtae lrcm {
		*Added year in by() b/c we will now have data for multiple years--not just 1977
		egen myear_`yyy' = mean(m`yyy'), by(msic year) 
	}
	
	* following sections added by JRP on 10/30/07
	save ps4_acrtfp_interim_c`xxx', replace
	
	*split the interim file into one file per census year
	foreach yyy in `ylist1' {
		use ps4_acrtfp_interim_c`xxx', clear
		keep if year==`yyy'
		save `yyy'_1_c`xxx', replace
	}
	
	*within each single-year file, egen without by() averages over all plants of that year
	*note: -use- has no clear option here; the data in memory was saved just before, so nothing is lost
	foreach yyy in `ylist1' {
		use `yyy'_1_c`xxx'
		egen ms`yyy'_rww=mean(msyear_rww)
		egen ms`yyy'_row=mean(msyear_row)
		egen ms`yyy'_rcm=mean(msyear_rcm)
		egen ms`yyy'_rtae=mean(msyear_rtae) 
		egen m`yyy'_lrtvs=mean(myear_lrtvs)
		egen m`yyy'_lpw=mean(myear_lpw)
		egen m`yyy'_loe=mean(myear_loe)
		egen m`yyy'_lrtae=mean(myear_lrtae)
		egen m`yyy'_lrcm=mean(myear_lrcm)
		save `yyy'_2_c`xxx', replace
	}
	
	*stack the single-year files again, starting from the first year of the sample
	use `ylist3'_2_c`xxx'
	foreach yyy in `ylist2' {
		append using `yyy'_2_c`xxx'
	}
	
	*after the append a year-specific mean is filled only on rows of its own year;
	*the mean without by() copies that value to every row as the variable ending in 2
	foreach yyy in `ylist1' {
		egen ms`yyy'_rww2=mean(ms`yyy'_rww)
		egen ms`yyy'_row2=mean(ms`yyy'_row)
		egen ms`yyy'_rcm2=mean(ms`yyy'_rcm)
		egen ms`yyy'_rtae2=mean(ms`yyy'_rtae) 
		egen m`yyy'_lrtvs2=mean(m`yyy'_lrtvs)
		egen m`yyy'_lpw2=mean(m`yyy'_lpw)
		egen m`yyy'_loe2=mean(m`yyy'_loe)
		egen m`yyy'_lrtae2=mean(m`yyy'_lrtae)
		egen m`yyy'_lrcm2=mean(m`yyy'_lrcm)
	}
		
	* compute components of productivity measure	
	*tc + year = sum over the four inputs of (share this year + share previous year)
	*            times (mean log input this year - mean log input previous year)
	*tc        = running sum of those terms from the first year of the sample, divided by 2
	*ta        = log output minus the first-year mean log output
	*tb        = half the sum over inputs of (plant share + msic-year mean share)
	*            times (plant log input - msic-year mean log input)
	if `xxx'==7297 {
		gen tc1972=0
		gen tc1977=(ms1977_rcm2+ms1972_rcm2)*(m1977_lrcm2-m1972_lrcm2)+(ms1977_rtae2+ms1972_rtae2)*(m1977_lrtae2-m1972_lrtae2)+(ms1977_row2+ms1972_row2)*(m1977_loe2-m1972_loe2)+(ms1977_rww2+ms1972_rww2)*(m1977_lpw2-m1972_lpw2)
		gen tc1982=(ms1982_rcm2+ms1977_rcm2)*(m1982_lrcm2-m1977_lrcm2)+(ms1982_rtae2+ms1977_rtae2)*(m1982_lrtae2-m1977_lrtae2)+(ms1982_row2+ms1977_row2)*(m1982_loe2-m1977_loe2)+(ms1982_rww2+ms1977_rww2)*(m1982_lpw2-m1977_lpw2)
		gen tc1987=(ms1987_rcm2+ms1982_rcm2)*(m1987_lrcm2-m1982_lrcm2)+(ms1987_rtae2+ms1982_rtae2)*(m1987_lrtae2-m1982_lrtae2)+(ms1987_row2+ms1982_row2)*(m1987_loe2-m1982_loe2)+(ms1987_rww2+ms1982_rww2)*(m1987_lpw2-m1982_lpw2)
		gen tc1992=(ms1992_rcm2+ms1987_rcm2)*(m1992_lrcm2-m1987_lrcm2)+(ms1992_rtae2+ms1987_rtae2)*(m1992_lrtae2-m1987_lrtae2)+(ms1992_row2+ms1987_row2)*(m1992_loe2-m1987_loe2)+(ms1992_rww2+ms1987_rww2)*(m1992_lpw2-m1987_lpw2)
		gen tc1997=(ms1997_rcm2+ms1992_rcm2)*(m1997_lrcm2-m1992_lrcm2)+(ms1997_rtae2+ms1992_rtae2)*(m1997_lrtae2-m1992_lrtae2)+(ms1997_row2+ms1992_row2)*(m1997_loe2-m1992_loe2)+(ms1997_rww2+ms1992_rww2)*(m1997_lpw2-m1992_lpw2)	
		gen tc=.
		replace tc=tc1972 if year==1972
		replace tc=tc1972+tc1977 if year==1977
		replace tc=tc1972+tc1977+tc1982 if year==1982
		replace tc=tc1972+tc1977+tc1982+tc1987 if year==1987
		replace tc=tc1972+tc1977+tc1982+tc1987+tc1992 if year==1992
		replace tc=tc1972+tc1977+tc1982+tc1987+tc1992+tc1997 if year==1997
		replace tc=tc/2

		**compute components of productivity measure
		gen ta  = (lrtvs-mlrtvs) + (mlrtvs-m1972_lrtvs2)
		gen tb = ((s_rcm+ms_rcm)*(lrcm-mlrcm) + (s_rtae+ms_rtae)*(lrtae-mlrtae) + (s_row+ms_row)*(loe-mloe) + (s_rww+ms_rww)*(lpw-mlpw) )/2
	}
	if `xxx'==7282 {
		gen tc1972=0
		gen tc1977=(ms1977_rcm2+ms1972_rcm2)*(m1977_lrcm2-m1972_lrcm2)+(ms1977_rtae2+ms1972_rtae2)*(m1977_lrtae2-m1972_lrtae2)+(ms1977_row2+ms1972_row2)*(m1977_loe2-m1972_loe2)+(ms1977_rww2+ms1972_rww2)*(m1977_lpw2-m1972_lpw2)
		gen tc1982=(ms1982_rcm2+ms1977_rcm2)*(m1982_lrcm2-m1977_lrcm2)+(ms1982_rtae2+ms1977_rtae2)*(m1982_lrtae2-m1977_lrtae2)+(ms1982_row2+ms1977_row2)*(m1982_loe2-m1977_loe2)+(ms1982_rww2+ms1977_rww2)*(m1982_lpw2-m1977_lpw2)
		gen tc=.
		replace tc=tc1972 if year==1972
		replace tc=tc1972+tc1977 if year==1977
		replace tc=tc1972+tc1977+tc1982 if year==1982
		replace tc=tc/2

		**compute components of productivity measure
		gen ta  = (lrtvs-mlrtvs) + (mlrtvs-m1972_lrtvs2)
		gen tb = ((s_rcm+ms_rcm)*(lrcm-mlrcm) + (s_rtae+ms_rtae)*(lrtae-mlrtae) + (s_row+ms_row)*(loe-mloe) + (s_rww+ms_rww)*(lpw-mlpw) )/2
	}
	if `xxx'==8797 {
		gen tc1987=0
		gen tc1992=(ms1992_rcm2+ms1987_rcm2)*(m1992_lrcm2-m1987_lrcm2)+(ms1992_rtae2+ms1987_rtae2)*(m1992_lrtae2-m1987_lrtae2)+(ms1992_row2+ms1987_row2)*(m1992_loe2-m1987_loe2)+(ms1992_rww2+ms1987_rww2)*(m1992_lpw2-m1987_lpw2)
		gen tc1997=(ms1997_rcm2+ms1992_rcm2)*(m1997_lrcm2-m1992_lrcm2)+(ms1997_rtae2+ms1992_rtae2)*(m1997_lrtae2-m1992_lrtae2)+(ms1997_row2+ms1992_row2)*(m1997_loe2-m1992_loe2)+(ms1997_rww2+ms1992_rww2)*(m1997_lpw2-m1992_lpw2)	
		gen tc=.
		replace tc=tc1987 if year==1987
		replace tc=tc1987+tc1992 if year==1992
		replace tc=tc1987+tc1992+tc1997 if year==1997
		replace tc=tc/2

		**compute components of productivity measure
		gen ta  = (lrtvs-mlrtvs) + (mlrtvs-m1987_lrtvs2)
		gen tb = ((s_rcm+ms_rcm)*(lrcm-mlrcm) + (s_rtae+ms_rtae)*(lrtae-mlrtae) + (s_row+ms_row)*(loe-mloe) + (s_rww+ms_rww)*(lpw-mlpw) )/2
	}
	*plant-level index
	gen acrtfp_p1 = ta - tb - tc
	sum ppn year tc-acrtfp_p1
	sort ppn year

	*attach firmid and tvs to each plant-year
	merge ppn year using ps4_plantchars, keep(firmid tvs)
	tab _merge
	drop if _merge==2
	drop _merge

	*take tvs weighted mean across plants in firm
	*note: msic is averaged with the same weights
	collapse (mean) acrtfp_p1 msic [aweight=int(tvs)], by(firmid year)
	sort firmid year
	save ps4_acrtfp`nyear'_c`xxx'_p1, replace
	***************************************************************************************

}


**combine measures
**  trim each firm-level TFP file to firmid year acrtfp_p1 msic and save it
**  under the name without the _p1 suffix
local nyear 1997
foreach xxx in 8797 7282 7297 {	
	use ps4_acrtfp`nyear'_c`xxx'_p1, clear
	keep firmid year acrtfp_p1 msic
	sort firmid year
	sum acrtfp*
	save ps4_acrtfp`nyear'_c`xxx', replace
}

log close






**5  Create product mix dummies by sample
**
**   Each firm-year gets an identifier (prodmix, ind4mix) that is shared by all
**   firm-years producing exactly the same set of ssic5 (ssic4) codes.
**   Method: split the sorted code list into subgroups, give the k-th code of a
**   subgroup the weight 2^k, and sum the weights of the codes a firm produces
**   within each subgroup.  Two firm-years have the same mix when all subgroup
**   sums agree.
clear
set mem 2600m
set more off
capture log close
log using ps4_basic_prodmix.log, replace text

foreach xxx in 8797 7282 7297 {	

	**1  Create background datasets listed below.  For the first three, include a column
	**   which is the binary weight needed to construct the base 10 representation. 
	**

	**create list of all product, etc lines (there are 1923 ssic5, xxx ssic4 and xxx ssic2)
	use fp5_02_c`xxx', clear
	keep ssic5
	duplicates drop ssic5, force
	sort ssic5
	*the encoding below has room for 39 subgroups of 50 codes; stop here rather
	*than silently leave the remaining codes without a weight
	assert _N <= 39*50
	save ps4_ssic5list_c`xxx', replace

	use fp5_02_c`xxx', clear
	gen ssic4=substr(ssic5,1,4)
	keep ssic4
	duplicates drop ssic4, force
	sort ssic4
	*room for 19 subgroups of 25 codes
	assert _N <= 19*25
	save ps4_ssic4list_c`xxx', replace	

	use fp5_02_c`xxx', clear
	gen ssic2=substr(ssic5,1,2)
	keep ssic2
	duplicates drop ssic2, force
	sort ssic2
	save ps4_ssic2list_c`xxx', replace			

	**chop up ssic5 lists:  a single number cannot carry one binary digit for each of
	**  about 2000 codes, so the ssic5 binary encoding is done in stages of 50 codes
	local xx = 50
	forvalues c=1/39 {
		local l = (`c'-1)*`xx'
		local u = `c'*`xx'
		display [`l'] " " [`u']		

		*rows l+1 to u of the sorted list; row k of the subgroup gets weight 2^k
		use ps4_ssic5list_c`xxx', clear
		keep if _n>`l' & _n<=`u'
		gen double binary`c'=2^_n
		keep ssic5 binary`c'
		sort ssic5
		save ps4_ssic5list_binary_0`c'_c`xxx', replace
	}

	**append all together
	use ps4_ssic5list_binary_01_c`xxx', clear
	forvalues x=2/39  {
		append using ps4_ssic5list_binary_0`x'_c`xxx'
	}
	sort ssic5
	save ps4_ssic5list_binary_c`xxx', replace


	**chop up ssic4 lists:  same staged encoding as for ssic5, in stages of 25 codes
	local xx = 25
	forvalues c=1/19 {
		local l = (`c'-1)*`xx'
		local u = `c'*`xx'
		display [`l'] " " [`u']

		use ps4_ssic4list_c`xxx', clear
		keep if _n>`l' & _n<=`u'
		gen double binary`c'=2^_n
		keep ssic4 binary`c'
		sort ssic4
		save ps4_ssic4list_binary_0`c'_c`xxx', replace
	}

	**append all together
	*  start from the ssic4 subgroup-1 file; starting from the ssic5 file would leave
	*  the first 25 ssic4 codes without a weight and add ssic5 rows to this list
	use ps4_ssic4list_binary_01_c`xxx', clear
	forvalues x=2/19  {
		append using ps4_ssic4list_binary_0`x'_c`xxx'
	}
	sort ssic4
	save ps4_ssic4list_binary_c`xxx', replace



	**2  Create product mix datasets
	**
	**

	**collapse to firm ssic5 pv
	use fp5_02_c`xxx', clear
	drop if pt_pv==.

	**read in binary weights for each product
	sort ssic5
	merge ssic5 using ps4_ssic5list_binary_c`xxx'
	tab _merge
	drop _merge	

	**compute base 10 representation for subgroup
	*  only products with positive value count towards the mix
	drop if pt_pv==0 | pt_pv==. | pt_pv<0
	forvalues x=1/39  {
		gen double binprodmix`x' = binary`x'
	}
	save ps4_prodmix_interim1_c`xxx', replace

	**check whether any of the binprodmixes sum to more than 10^300.  If so, they will
	**be incorrect.  did not find any...
	use ps4_prodmix_interim1_c`xxx', clear
	forvalues x=1/39  {
		gen logcheck`x' = log10(binprodmix`x')
	}
	*nprod counts the products of the firm-year
	gen nprod=1	
	collapse (sum) binprodmix* logcheck* nprod, by(firmid year)
	sum logcheck*
	forvalues x=1/39  {
		rename binprodmix`x' p`x'
		label var p`x' "unique product mix identifier for subgroup `x'"
	}
	sort firmid year		
	save ps4_prodmix_interim2_c`xxx', replace

	**now create groups for prod mix
	*  after sorting on p1-p39 identical mixes are adjacent; a row opens a new
	*  group unless all 39 subgroup sums equal those of the row above
	use ps4_prodmix_interim2_c`xxx', clear
	sort p1-p39
	des
	forvalues x=1/39 {
		gen i`x' = p`x'==p`x'[_n-1]
	}
	egen i = rowtotal(i1-i39)
	tab i
	gen idx=1
	replace idx=0 if i==39
	gen prodmix = sum(idx)
	drop i1-i39
	order firmid year prodmix p1-p39
	sort firmid year 
	save ps4_prodmix_c`xxx', replace



	**3  Create ind4 mix datasets
	**
	**
	clear
	set mem 2000m	

	**collapse to firm ssic4 pv
	use fp5_02_c`xxx', clear
	gen str4 ssic4 = substr(ssic5,1,4)		
	collapse (sum) pt_pv, by(firmid year ssic4)

	**read in binary weights for each product
	sort ssic4
	merge ssic4 using ps4_ssic4list_binary_c`xxx'
	tab _merge
	drop _merge

	**compute base 10 representation for subgroup
	*  only industries with positive value count towards the mix
	gen idx = pt_pv>0 & pt_pv~=.
	drop if idx==0
	forvalues x=1/19  {
		gen double binprodmix`x' = idx*binary`x'
	}
	save ps4_ind4mix_interim1_c`xxx', replace

	**check whether any of the binprodmixes sum to more than 10^300.  If so, they will
	**be incorrect.  did not find any...
	forvalues x=1/19  {
		gen logcheck`x' = log10(binprodmix`x')
	}	
	collapse (sum) binprodmix* logcheck*, by(firmid year)
	sum logcheck*

	forvalues x=1/19  {
		rename binprodmix`x' p`x'
		label var p`x' "unique ind4 identifier for subgroup `x'"
	}
	sort firmid year		
	save ps4_ind4mix_interim2_c`xxx', replace

	**now create groups for prod mix
	*  same adjacent-row comparison as for prodmix, on the 19 ssic4 subgroup sums
	*  -clear- is the documented option of -use- for discarding the data in memory
	use ps4_ind4mix_interim2_c`xxx', clear
	sort p1-p19
	des
	forvalues x=1/19 {
		gen i`x' = p`x'==p`x'[_n-1]
	}
	egen i = rowtotal(i1-i19)
	gen idx=1
	replace idx=0 if i==19
	gen ind4mix = sum(idx)
	drop i1-i19
	order firmid year ind4mix p1-p19
	sort ind4mix
	save ps4_ind4mix_c`xxx', replace

}

log close



**6 Use Plant's Major Industry Code to Create Firm MSIC
**  Do this by finding out firm's predominant four digit SIC root
**  Reads : fp5_02_c8797
**  Writes: ps4_msic_c8797, one row per firmid year with msic = the four-digit
**          root carrying the largest summed pt_pv
use fp5_02_c8797, clear 

*drop the fill-in rows BEFORE collapsing.  collapse (sum) turns a group of
*missing values into 0, so dropping missing pt_pv after the collapse removes
*nothing and firm-years without any data would all tie at zero.
*NOTE(review): firm-years with no observed product value therefore no longer
*appear in the output - confirm no downstream step expects a row for them.
drop if pt_pv==. 

gen ssic4 = substr(ssic5,1,4) 
collapse (sum) pt_pv, by(firmid ssic4 year)
destring ssic4, force g(sic4) 

*keep the largest industry of each firm-year.  Ties are broken by the lowest
*sic4 so the result is always an actual industry code; averaging tied codes
*would produce a number that is not a SIC code.
gen double negpv = -pt_pv 
bysort firmid year (negpv sic4): keep if _n==1 

gen msic = sic4 
keep firmid year msic 
sort firmid year 
save ps4_msic_c8797, replace






