* Session setup: pin the interpreter version and make double the default
* storage type for newly created variables.
version 12.1
set type double


* Project root directory. The script changes into its data/ subfolder below and
* reads and writes every .dta file relative to that folder.
global TERMINAL "/mnt/data0/work/MPF_FINAL"

***************************************************************************
* Pre GMM Preparation
* Generate data for estimation, and also data to be used in post-estimation
***************************************************************************


	* Start from an empty session, disable output paging, and set the
	* matrix-size and variable-count limits before any data are loaded.
	clear all
	set more off
	set matsize 11000
	set maxvar 32767
	
	
	* The path is quoted so that cd keeps working if TERMINAL contains spaces.
	cd "${TERMINAL}/data/"
	use KiltsQuarterlyCleanV2_v4.dta



	* Drop nonsense or incomplete observations.
	* missing() is used rather than "== ." because "== ." matches only the system
	* missing value; extended missing values (.a through .z) would otherwise pass
	* every filter here, since missing values compare greater than any number.
	drop if quantity <= 0 | missing(quantity)
	drop if value <= 0 | missing(value)
	drop if missing(price, upc, rawb)



	* Keep one row per upc x module x brand x firm x product group x quarter.
	* force is needed because rows agreeing on these keys may still differ in
	* other variables; the row that survives is the first one in the current order.
	duplicates drop upc  product_module_code brand_code firmid product_group_code panel_year quarter, force

	* Log price, used later for the price differences.
	generate double lnprice = ln(price)

	* Integer identifiers built with egen group():
	*   time - one value per (panel_year, quarter) pair
	*   bm   - one value per product group x firm x module x brand
	*   fm   - one value per product group x firm
	sort upc panel_year quarter
	egen double time = group(panel_year quarter)

	sort product_group_code firmid product_module_code brand_code
	egen double bm = group(product_group_code firmid product_module_code brand_code)

	sort product_group_code firmid product_module_code
	egen double fm = group(product_group_code firmid)

	
	* Find number of buyers (rawb summed) for each brand, firm, and product group, by quarter.
	* egen total() is used throughout; the buyer counts previously went through the
	* undocumented legacy name sum(), while the sales totals below already used total().
	bys product_group_code firmid product_module_code brand_code panel_year quarter: egen double brandbuyers = total(rawb)

	* Unweighted firm-level sum of rawb; the sales-share-weighted version is firmbuyers below.
	bys product_group_code firmid panel_year quarter: egen double firmbuyersold = total(rawb)

	bys product_group_code panel_year quarter: egen double groupbuyers = total(rawb)



	* Sales by brand, firm, product group, and over all groups, by quarter
	bys product_group_code firmid product_module_code brand_code panel_year quarter: egen double brandsalestotal = total(value)
	bys product_group_code firmid panel_year quarter: egen double firmsalestotal = total(value)
	bys product_group_code panel_year quarter: egen double groupsalestotal = total(value)
	bys panel_year quarter: egen double totalsalesquart = total(value)


	* Nested sales shares: product within firm, brand within firm,
	* firm within product group, product group within the quarter total.
	gen double shareprodfirm = value/firmsalestotal
	gen double sharebrandfirm = brandsalestotal/firmsalestotal
	gen double sharefirmgroup = firmsalestotal/groupsalestotal
	gen double sharegroupagg = groupsalestotal/totalsalesquart
	gen double lnshareprodfirm = ln(shareprodfirm)
	gen double lnsharebrandfirm = ln(sharebrandfirm)
	gen double lnsharefirmgroup = ln(sharefirmgroup)
	gen double lnsharegroupagg = ln(sharegroupagg)


	* Firm-level rawb, weighting each product's rawb by its share of firm sales
	gen double weightedrawb = shareprodfirm*rawb

	bys product_group_code firmid panel_year quarter: egen double firmbuyers = total(weightedrawb)

	sort product_group_code firmid product_module_code brand_code upc

	* Consecutive integer id for each distinct upc
	egen double upc2 = group(upc)

	* Full prepared panel; it is reloaded twice further down in this script.
	save NestedMPFPriceIndexRedo_v3.dta, replace
	
	* Select one base UPC per firm x product group and save the list to baseupcs_v3.dta.
	* The base good is the UPC with the largest total sales among the UPCs that have
	* the most remaining observations within their firm x product group.
	* This step works on the data left in memory by the preceding save.
	drop if panel_year == 1994

	** Base good needs at least 20 rawb **
	* Rows with fewer than 20 rawb do not count towards a UPC's availability.
	drop if rawb < 20

	* Criteria 1: keep the UPCs with the maximum number of remaining rows
	* within their firm x product group.
	* No share or log-change variables are built here: the collapse below keeps
	* only value and the by-variables, so they could never reach the saved file.
	bys product_group_code firmid upc: gen double count_yr = _N
	bys product_group_code firmid: egen double max_count_yr = max(count_yr)
	keep if count_yr == max_count_yr

	* Criteria 2: find the largest good (by summed sales value) satisfying criteria 1.
	collapse (sum) value, by(product_group_code firmid fm upc) fast

	* Largest good first within each fm. upc is added as a final sort key so that
	* ties in value are always resolved the same way; without it the order of tied
	* rows is arbitrary and the chosen base UPC could change between runs.
	gsort fm -value upc
	drop if value == 0
	drop if upc == .
	keep if _n == 1 | fm != fm[_n-1]

	keep product_group_code firmid upc
	sort product_group_code firmid upc
	save baseupcs_v3.dta, replace
		
	* Build the base-good panel and save it to basedata_v3.dta:
	* one row per firm x product group x quarter, holding the base UPC's
	* log price and within-firm share together with their one-quarter lags.
	use NestedMPFPriceIndexRedo_v3.dta, clear
	drop if panel_year == 1994

	* Rebuild the quarter index on the sample without 1994
	drop time
	sort upc panel_year quarter
	egen double time = group(panel_year quarter)

	* Keep only rows that belong to a selected base UPC
	sort product_group_code firmid upc
	merge m:1 product_group_code firmid upc using baseupcs_v3.dta
	keep if _merge == 3

	* Base-good rows with fewer than 20 rawb are not used
	drop if rawb < 20

	summarize rawb, detail

	keep bm fm product_group_code firmid panel_year quarter lnprice shareprodfirm upc upc2 lnshareprodfirm time
	rename (lnprice shareprodfirm upc2) (lnprice_base shareprodfirm_base upc_base)

	generate double lnshareprodfirm_base = ln(shareprodfirm_base)

	* One-quarter lags within base UPC x firm x product group. A lag is filled in
	* only when the previous row is the immediately preceding quarter of the same
	* base UPC; otherwise it is left missing.
	* (An equivalent cond()-based formulation was noted to produce identical values.)
	foreach basevar in lnshareprodfirm_base shareprodfirm_base lnprice_base {
		bysort upc_base firmid product_group_code (panel_year quarter): generate double lag_`basevar' = `basevar'[_n-1] if time == time[_n-1] + 1 & upc_base == upc_base[_n-1]
	}

	drop if upc_base == .

	* Enforce a single row per merge key before saving
	duplicates drop firmid product_group_code panel_year quarter, force
	sort firmid product_group_code panel_year quarter

	save basedata_v3.dta, replace
		
	* Build the estimation dataset and save it to PreGMM1V2_v3.dta:
	* for every non-base UPC, the quarter-on-quarter change in log price and in
	* log within-firm share, each net of the same change for the firm's base UPC,
	* plus an observation weight.
	use NestedMPFPriceIndexRedo_v3.dta, clear
	drop if panel_year == 1994

	* Rebuild the quarter index on the sample without 1994
	drop time
	sort upc panel_year quarter
	egen double time = group(panel_year quarter)

	* Attach the base-good series of the same firm x product group x quarter.
	** don't drop base upc **
	* Unmatched rows are kept at this point; they drop out later through missing differences.
	sort firmid product_group_code panel_year quarter
	merge m:1 firmid product_group_code panel_year quarter using basedata_v3.dta
	drop _merge

	*** variables for GMM estimation ***
	* dropupc flags the base UPC itself and rows with fewer than 20 rawb;
	* dropupc2 flags only the latter.
	generate double dropupc = (upc2 == upc_base) | (rawb < 20)
	generate double dropupc2 = (rawb < 20)

	*** UPC ***
	* One-quarter lags within upc x firm x product group, filled in only when the
	* previous row is the immediately preceding quarter of the same upc.
	foreach upcvar in lnprice lnshareprodfirm {
		bysort upc firmid product_group_code (panel_year quarter): generate double lag_`upcvar' = `upcvar'[_n-1] if time == time[_n-1] + 1 & upc == upc[_n-1]
	}

	* Double differences: own change minus the base good's change
	generate double ddupcprice = (lnprice - lag_lnprice) - (lnprice_base - lag_lnprice_base)
	generate double ddfirmprodshare = (lnshareprodfirm - lag_lnshareprodfirm) - (lnshareprodfirm_base - lag_lnshareprodfirm_base)

	keep fm upc2 upc upc_base rawb product_module_code firmid product_group_code panel_year quarter time dropupc ddfirmprodshare ddupcprice

	sort product_group_code firmid upc panel_year quarter time

	* Sum of inverse rawb for the current and the previous quarter of the same upc;
	* bias1 keeps this raw value because weight is transformed further down.
	generate double weight = 1/rawb + 1/rawb[_n-1] if time == time[_n-1] + 1 & upc == upc[_n-1]
	generate double bias1 = weight

	drop if ddupcprice == . | ddfirmprodshare == .

	* Drop if bad UPC, too small, base UPC
	drop if upc2 == upc_base | rawb < 20 | dropupc == 1
	drop dropupc

	* Number of usable observations per upc x firm x product group
	bysort upc firmid product_group_code: egen double countobs = count(ddupcprice)
	summarize countobs, detail

	* Weights: countobs^(3/2) divided by the square root of the inverse-rawb sum
	drop if weight == .
	replace weight = weight^(-1/2) * countobs^(3/2)
	drop countobs

	sort upc time

	* Re-index the quarters that remain after all the drops
	drop time
	sort panel_year quarter
	egen double time = group(panel_year quarter)

	drop if product_group_code == .
	sort product_group_code firmid upc time

	drop product_module_code firmid upc2 rawb

	order upc fm product_group_code ddupcprice ddfirmprodshare weight

	save PreGMM1V2_v3.dta, replace
