* ---- Session configuration -------------------------------------------------
* Pin the interpreter version so later Stata releases run this file the same way.
version 12.1

* Start from an empty session and never pause output.
clear all
set more off

* New variables default to double precision; raise matrix and variable limits.
set type double
set matsize 11000
set maxvar 32767

* Root of the project tree; every path below is built from this global.
global TERMINAL "/mnt/data0/work/MPF_FINAL"
cd ${TERMINAL}/data/KILTS


********************************************************
* KILTS Data
* Aggregate to natl-quarterly from HH-city-week
* Combine datasets (deleting intermediate steps)
*********************************************************

*cap prog drop KILTS_agg
*prog define KILTS_agg

	clear
	*set niceness 5
	set max_memory 256g
	set segmentsize 512m

	* Count panelist households per (panel_year, scantrack market), one annual
	* HMS panelist file at a time, for panel years 2004 through 2011.
	forvalues yr = 2004/2011 {
		use "${TERMINAL}/data/KILTS/nielsen_extracts/HMS/`yr'/Annual_Files/panelists_`yr'.dta", clear
		keep household_code panel_year projection_factor projection_factor_magnet scantrack_market_code scantrack_market_descr
		* After the collapse, household_code holds the number of non-missing household codes in each cell
		collapse (count) household_code, by(panel_year scantrack_market_code) fast
		save HouseholdCount`yr'.dta, replace
	}

	* Stack the eight annual count files into one market-year table
	use HouseholdCount2004.dta, clear
	forvalues yr = 2005/2011 {
		append using HouseholdCount`yr'
	}

	* nr_hh = household count for the market-year
	rename household_code nr_hh
	sort panel_year scantrack_market_code
	save nr_hh_city_year.dta, replace



***************************************************************************
* Pre GMM Preparation
* Generate data for estimation, and also data to be used in post-estimation
****************************************************************************


	clear

	cd ${TERMINAL}/data/

	* Load the price-index panel, discard panel year 1994 and rebuild the
	* consecutive quarter counter from (panel_year, quarter).
	use NestedMPFPriceIndexRedo_v3.dta, clear
	drop if panel_year == 1994
	drop time

	sort upc  panel_year quarter
	egen double time = group(panel_year quarter)

	* Bring in the base-data file, keyed on firm x product group x quarter.
	* Every row is kept regardless of match status.
	sort firmid product_group_code panel_year quarter
	merge m:1 firmid product_group_code panel_year quarter using basedata_v3.dta
	** don't drop base upc  **
	drop _merge

	*** variables for GMM estimation ***
	* dropupc  : 1 for the base UPC itself or for rows with rawb below 20
	* dropupc2 : 1 for rows with rawb below 20 only
	gen double dropupc = (upc2 == upc_base) | (rawb < 20)
	gen double dropupc2 = (rawb < 20)

	*** UPC ***
	* One-quarter lags within upc x firm x product group. A lag is filled only
	* when the previous row is exactly one period earlier for the same upc.
	sort upc firmid product_group_code panel_year quarter
	foreach v in lnprice lnshareprodfirm {
		by upc firmid product_group_code: gen double lag_`v' = `v'[_n-1] if time == time[_n-1] + 1 & upc == upc[_n-1]
	}

	* Double differences: own quarterly change minus the base series' change.
	* The *_base variables are not created here; presumably they arrive with basedata_v3.dta -- TODO confirm.
	gen double ddupcprice = (lnprice - lag_lnprice) - (lnprice_base - lag_lnprice_base)
	gen double ddfirmprodshare = (lnshareprodfirm - lag_lnshareprodfirm) - (lnshareprodfirm_base - lag_lnshareprodfirm_base)
	
	
	
	
	* Retain identifiers, the buyer count, the drop flag and the two double-differenced series
	keep fm upc2 upc upc_actual upc_ver_uc upc_base rawb product_module_code firmid product_group_code panel_year quarter time dropupc ddfirmprodshare ddupcprice

	sort product_group_code firmid upc panel_year quarter time

	* Raw weight: 1/rawb in this row plus 1/rawb in the previous row, defined
	* only when the previous row is the same upc one period earlier.
	* bias1 keeps a copy of the raw value before it is transformed below.
	gen double weight = 1/rawb + 1/rawb[_n-1] if time == time[_n-1] + 1 & upc == upc[_n-1]
	gen double bias1 = weight
	drop if ddupcprice == .
	drop if ddfirmprodshare == .

	* Drop if bad UPC, too small, base UPC
	drop if upc2 == upc_base | rawb < 20 | dropupc == 1
	drop dropupc

	* Number of usable observations per upc x firm x product group
	bys upc firmid product_group_code: egen double countobs = count(ddupcprice)
	sum countobs,d

	* Weights: final weight = raw^(-1/2) * countobs^(3/2)
	drop if weight == .
	replace weight = weight^(-1/2) * countobs^(3/2)
	drop countobs

	sort upc time

	* Rows were removed above, so renumber the quarter counter
	drop time
	sort panel_year quarter
	egen double time = group(panel_year quarter)

	drop if product_group_code == .
	sort product_group_code firmid upc time

	drop product_module_code firmid upc2  rawb

	order upc fm product_group_code ddupcprice ddfirmprodshare weight


	** trim the double differenced variables
* For each series, blank out values at or beyond its own 1st / 99th percentile.
* Rows are removed only after both series have been processed, so the second
* series' percentiles are computed before any row is dropped.
foreach v in ddupcprice ddfirmprodshare {
	sum `v',d
	local lo = `r(p1)'
	local hi = `r(p99)'
	replace `v' = . if `v' <= `lo'
	replace `v' = . if `v' >= `hi'
}
drop if ddupcprice == .
drop if ddfirmprodshare == .

** generate variables used in the estimation
* squares and cross product of the double differenced variables
gen double y2 = ddupcprice*ddupcprice
gen double z1 = ddupcprice*ddfirmprodshare
gen double z2 = ddfirmprodshare*ddfirmprodshare

* product group x upc averages of the weight, the squared / cross terms and the raw weight copy
bysort product_group_code upc: egen double weight_bar = mean(weight)
foreach v in y2 z1 z2 {
	bysort product_group_code upc: egen double `v'_bar = mean(`v')
}
bysort product_group_code upc: egen double b1 = mean(bias1)

** clean: drop upcs with fewer than 10 observations in a product group, and
** fewer than 20 everywhere except in the product groups listed below
bysort product_group_code upc: egen double t2 = count(y2)
sum t2,d
drop if t2 < 10
drop if t2 < 20 & !inlist(product_group_code, 2004, 5505, 5509, 5517, 6005, 6006, 6007, 6013)

* scale the averaged terms by each upc's share of the product group's total weight
bysort product_group_code: egen double sum_weight = total(weight_bar)
gen double weight_bar_other = weight_bar/sum_weight
foreach var in y2_bar z1_bar z2_bar {
	replace `var' = `var'*weight_bar_other
}
collapse (mean) y2_bar z1_bar z2_bar t2 weight_bar_other b1 ddfirmprodshare ddupcprice, by(product_group_code upc_actual upc upc_ver_uc) fast

* drop product groups with fewer than 30 remaining rows
bysort product_group_code: egen countobs2 = count(t2)
drop if countobs2 < 30

* Export the list of UPCs that survive into the estimation sample.
keep upc_actual upc_ver_uc

* The collapse above is keyed on (product_group_code upc_actual upc upc_ver_uc),
* so the same (upc_actual, upc_ver_uc) pair can occur more than once here.
* This list is later the "using" side of a merge m:1 on exactly these two
* variables, which aborts unless they identify rows uniquely.
duplicates drop

order upc_actual upc_ver_uc

sort upc_actual upc_ver_uc

save upc_used_estimation_list.dta, replace

clear

cd ${TERMINAL}/data/KILTS

* National-level UPC records: load only the needed columns and tag the measures with a _nat suffix.
use panel_year quarter product_group_code upc upc_ver_uc quantity value price using KiltsNationalQuarterly.dta
drop if panel_year == .
foreach v in quantity value price {
	rename `v' `v'_nat
}

* firmid = first five characters of the UPC written out as a whole number
gen str20 upc_txt = string(upc,"%20.0g")
gen firmid2 = substr(upc_txt,1,5)
destring firmid2, replace
gen firmid = firmid2
drop firmid2 upc_txt
save national_data_all, replace

* Attach the national measures to the city x quarter file.
* firmid is deliberately not part of the merge key (see the disabled variant below).
use Kilts_city_quarterly.dta, clear
sort panel_year quarter product_group_code firmid upc upc_ver_uc
*merge m:1 panel_year quarter product_group_code firmid upc upc_ver_uc using national_data_all.dta
merge m:1 panel_year quarter product_group_code upc upc_ver_uc using national_data_all.dta
tab _merge
drop _merge
save Kilts_city_quarterly_Wnat.dta, replace

* Attach the household count of each market-year; keep matched rows only.
sort panel_year scantrack_market_code
merge m:1 panel_year scantrack_market_code using "${TERMINAL}/data/KILTS/nr_hh_city_year.dta"
tab _merge
keep if _merge == 3
drop _merge
* NOTE(review): an earlier comment here announced aggregating the three NY
* scantrack codes (008, 009, 010) into a new code (078); no such step is
* performed at this point -- the data are only sorted and saved.
sort panel_year quarter scantrack_market_code product_group_code firmid upc upc_ver_uc
save Kilts_city_q_Wnat_WnrHH.dta, replace

/*
* ----------------------------------------------------------
* Correlation between firms' city and national sales 
* ----------------------------------------------------------
* Test 1 
* 
* Define regional markets as cities with more than 1000 households (raw buyers )
* To project the total New York City market, researchers should use the aggregation of all 
* three NY Scantrack codes: urban NY (009), exurban NY (010), and suburban NY (008)  
* bys panel_year quarter scantrack_market_code: egen max_households=max(rawb) 
* drop if max_households<1000
use Kilts_city_q_Wnat_WnrHH.dta, clear 
drop if nr_hh<1000 & (scantrack_market_code!=008 & scantrack_market_code!=009 & scantrack_market_code!=010) 
*compute the firm share at city level within each product group
bys panel_year quarter scantrack_market_code product_group_code firmid: egen firm_city_sales=sum(value)
bys panel_year quarter scantrack_market_code product_group_code: egen pg_city_sales=sum(value)  
gen firm_city_share=firm_city_sales/pg_city_sales
*
bys panel_year quarter product_group_code firmid: egen firm_nat_sales=sum(value) 
bys panel_year quarter product_group_code: egen pg_nat_sales=sum(value) 
gen firm_nat_share=firm_nat_sales/pg_nat_sales
*
bys panel_year quarter product_group_code firmid: egen firm_nat_sales2=sum(value_nat) 
bys panel_year quarter product_group_code: egen pg_nat_sales2=sum(value_nat) 
gen firm_nat_share2=firm_nat_sales2/pg_nat_sales2
*
save Kilts_city_quarterly_bigcity_Wnat.dta, replace 
*
collapse (firstnm) firm_city_share firm_nat_share firm_nat_share2, by(panel_year quarter scantrack_market_code product_group_code firmid) fast 
save city_nat_firm_share.dta, replace 
* define dummy variables that take value 1 if (i) firm share is greater than 5%, (ii) firm share is greater than 10%
use city_nat_firm_share.dta, clear 
gen dummy5_city=0
replace dummy5_city=1 if firm_city_share>0.05
*
gen dummy10_city=0
replace dummy10_city=1 if firm_city_share>0.1
*
gen dummy5_nat=0
replace dummy5_nat=1 if firm_nat_share>0.05
*
gen dummy10_nat=0
replace dummy10_nat=1 if firm_nat_share>0.1
*
gen dummy2_5_nat=0
replace dummy2_5_nat=1 if firm_nat_share2>0.05
*
gen dummy2_10_nat=0
replace dummy2_10_nat=1 if firm_nat_share2>0.1
*  compute correlations between (i) the national level firm share and the regional firm share, (ii) the two dummies at the national and the regional level
* (the hope is that these correlations turn out to be high)
pwcorr firm_nat_share firm_city_share firm_nat_share2 
pwcorr  dummy5_city dummy10_city dummy5_nat dummy10_nat dummy2_5_nat dummy2_10_nat
*
*/
* Test 2
*
* For brands, and then again for firms: count the markets in which the unit
* has positive sales in a quarter, and compute the share of quarterly sales
* accounted for by units sold in 10 or more markets.
* Outputs per pass: nr_<unit>_sold.dta and national_<unit>_share_sales.dta.
foreach unit in brand firm {

	* identifier variable for this pass
	if "`unit'" == "brand" {
		local idvar brand_code_uc
	}
	else {
		local idvar firmid
	}

	use Kilts_city_q_Wnat_WnrHH.dta, clear
	* use Kilts_city_quarterly_bigcity_Wnat.dta, clear

	* sales of each unit in each market-quarter
	sort panel_year quarter `idvar' scantrack_market_code
	collapse (sum) value (firstnm) nr_hh, by(panel_year quarter `idvar' scantrack_market_code) fast
	drop if value<=0 | value == .

	* the market count is taken BEFORE the small markets are removed
	bys panel_year quarter `idvar': egen nr_city=count(scantrack_market_code)

	* keep markets with at least 1000 households, plus codes 008, 009 and 010 regardless of size
	drop if nr_hh<1000 & !inlist(scantrack_market_code, 8, 9, 10)

	collapse (sum) value (firstnm)  nr_city, by(panel_year quarter `idvar') fast
	save nr_`unit'_sold.dta, replace

	* share of each quarter's sales coming from units present in 10+ markets
	bys panel_year quarter: egen double totalsales = sum(value)
	drop if nr_city<10
	bys panel_year quarter: egen double total_salesN`unit'=sum(value)
	gen double share = total_salesN`unit'/totalsales
	collapse (firstnm) share, by(panel_year quarter) fast
	sum share,d
	save national_`unit'_share_sales.dta, replace
}

* Compute what share of the sales is accounted for by this national brand (national brands account for most of the sales) 
*
* ------------------------------------------------
* UPC price variation across regions
* ------------------------------------------------
* Test
* collapse the data on UPCs used in the estimation (keep only obs with rawb>20, are in the sample for very long, ), and the 10 cities with well defined average prices (keep only cities with rawb>1000) 
*
cd ${TERMINAL}/data/KILTS/

* Restrict the city x quarter panel to the UPCs on the estimation list
use Kilts_city_q_Wnat_WnrHH.dta, clear
rename upc upc_actual
sort upc_actual upc_ver_uc
merge m:1 upc_actual upc_ver_uc using "${TERMINAL}/data/upc_used_estimation_list.dta"
tab _merge
keep if _merge == 3
drop _merge

* upc3: one numeric id per (upc_actual, upc_ver_uc) pair
egen upc3=group(upc_actual upc_ver_uc)

* keep markets with at least 1000 households, plus codes 008, 009 and 010 regardless of size
drop if nr_hh<1000 & !inlist(scantrack_market_code, 8, 9, 10)
save Kilts_city_q_Wnat_WnrHH_Wupcest.dta, replace
* City versus national UPC prices: levels and quarter-to-quarter changes.
*
* Start from the estimation-sample city panel. The city price is the log of
* the unit value (value divided by quantity); the national price is the log
* of price_nat.
use Kilts_city_q_Wnat_WnrHH_Wupcest.dta, clear
rename value city_value
rename quantity city_quantity
gen city_price=city_value/city_quantity
replace city_price = ln(city_price)

drop if (product_group_code == 3501 | product_group_code == 4001)
drop if rawb < 20

rename price_nat nat_price
replace nat_price  = ln(nat_price)

* Consecutive quarter counter, built after the drops above. egen may change
* the sort order, so the panel sort comes after it.
egen time=group(panel_year quarter)
sort scantrack_market_code upc3 panel_year quarter

* Quarter-by-quarter differences within market x UPC.
* Rows were removed above (rawb < 20), so the previous row of a series is not
* necessarily the previous quarter. A lag is therefore filled only when the
* previous row is exactly one period earlier -- the same guard used for the
* lags in the GMM preparation -- and differences spanning a gap stay missing.
by scantrack_market_code upc3: gen lag_city_price=city_price[_n-1] if time == time[_n-1] + 1
by scantrack_market_code upc3: gen diff_city_price=city_price-lag_city_price

by scantrack_market_code upc3: gen lag_nat_price=nat_price[_n-1] if time == time[_n-1] + 1
by scantrack_market_code upc3: gen diff_nat_price=nat_price-lag_nat_price

save time_city_upc.dta, replace

* Levels: regress the city price on the national price with quarter dummies,
* absorbing UPC effects.
areg city_price nat_price i.time, absorb(upc3)
local p = _b[nat_price]
local con = _b[_cons]
* xbd = linear prediction plus the absorbed upc3 effect. Removing the
* national-price term and the constant leaves the quarter-dummy and UPC parts.
predict fit, xbd
replace fit = fit - `p'*nat_price - `con'
* city price net of those quarter and UPC components, regressed on the national price alone
gen partial_price = city_price - fit
reg partial_price nat_price

* Changes: same procedure on the quarter-to-quarter differences.
areg diff_city_price diff_nat_price i.time, absorb(upc3)
local p2 = _b[diff_nat_price]
local con2 = _b[_cons]
predict fit2, xbd
replace fit2 = fit2 - `p2'*diff_nat_price - `con2'
gen partial_price2 = diff_city_price - fit2
reg partial_price2 diff_nat_price

save city_price_regression.dta, replace



