* Session setup for the data-preparation script.
* Run every command below under Stata 12.1 semantics.
version 12.1

* Start from an empty session and never pause output for --more--.
clear all
set more off

* New variables default to double precision (issued once; the setting
* stays in force for the rest of the session).
set type double

* Root folder of the project; every absolute path below is built from it.
global TERMINAL "/mnt/data0/work/MPF_FINAL"

* Save the Kilts datasets into Stata format.
* For each dataset family (panelists, trips, purchases) and each year
* 2004-2011, read <family>_<year>.tsv from that year's Annual_Files folder
* and write <family>_<year>.dta next to it, replacing any existing copy.
foreach dset in "panelists" "trips" "purchases" {
	forvalues yr = 2004/2011 {
		* Folder holding this year's annual extracts.
		local annual "${TERMINAL}/data/KILTS/nielsen_extracts/HMS/`yr'/Annual_Files"
		insheet using "`annual'/`dset'_`yr'.tsv", clear
		save "`annual'/`dset'_`yr'.dta", replace
	}
}


* Convert the master products file to Stata format.
* filefilter first writes productsfiltered.tsv, a copy of products.tsv in
* which every double-quote character (\Q) is replaced by a space; that
* filtered copy is what gets read and saved as products.dta.
local latest "${TERMINAL}/data/KILTS/nielsen_extracts/HMS/Master_Files/Latest"
filefilter "`latest'/products.tsv" "`latest'/productsfiltered.tsv", from("\Q") to(" ") replace
insheet using "`latest'/productsfiltered.tsv", clear
save "`latest'/products.dta", replace


* Create the unmatchedupc dataset from GS1 data
* Work inside the GS1 folder: the relative file names used further down
* (gs1.dta, allupc.dta, unmatchedupc.dta) resolve against it.
* The whole path sits inside one pair of quotes so it is passed to cd as a
* single argument even if the project root contains spaces.
cd "${TERMINAL}/data/GS1/"


//--------------
// Combine Data
//--------------

* Build allupc.dta: the de-duplicated, sorted list of every UPC found in the
* yearly products_extra files (2004-2011) and in the master products file.
* Only columns 1-12 of each line are read, as a string variable named upc.
* The first line of every file is dropped (presumably the header row).
* The running pool is kept in a Stata tempfile, so nothing named temp.dta is
* created in, or left behind in, the working directory.
tempfile upcpool

forvalues yr = 2004/2011 {
	infix str upc 1-12 using "${TERMINAL}/data/KILTS/nielsen_extracts/HMS/`yr'/Annual_Files/products_extra_`yr'.tsv", clear
	drop if _n == 1
	* From the second year on, fold in the UPCs collected so far and keep
	* one row per UPC.
	if `yr' > 2004 {
		append using `upcpool'
		duplicates drop upc, force
	}
	save `upcpool', replace
}

* Add the UPCs from the master products file, de-duplicate once more, and
* store the result sorted by upc.
infix str upc 1-12 using "${TERMINAL}/data/KILTS/nielsen_extracts/HMS/Master_Files/Latest/products.tsv", clear
drop if _n == 1
append using `upcpool'
duplicates drop upc, force
sort upc
save allupc.dta, replace



* Prepare the GS1 company-prefix lookup table.
* The prefix column is renamed to upc so it can serve as the merge key, and
* every "=" and every double-quote character (char(34)) is removed from it.
* The cleaned table is stored in the tempfile gs1prefix.
use "gs1.dta", clear
rename gs1companyprefix upc
replace upc = subinstr(subinstr(upc, "=", "", .), char(34), "", .)
tempfile gs1prefix
save `gs1prefix', replace



* First matching pass: look up each UPC by its first 6 characters in the GS1
* prefix table. Only matched rows are kept; they form the initial contents
* of the tempfile upcfirm. The full code is preserved in upcfull.
use allupc.dta, clear
rename upc upcfull
gen upc = substr(upcfull, 1, 6)
merge m:1 upc using `gs1prefix', keep(match) nogenerate
tempfile upcfirm
save `upcfirm', replace

* Further matching passes: retry the UPCs that are still unmatched using
* progressively longer prefixes (first 7, then 8, ... up to 11 characters).
* Each pass appends its new matches to the accumulated matches in upcfirm.
forvalues i = 7/11 {
	use allupc.dta, clear
	rename upc upcfull
	gen upc = substr(upcfull, 1, `i')

	* Drop every UPC already present in upcfirm (matched on a shorter prefix).
	merge 1:1 upcfull using `upcfirm'
	drop if _merge == 3
	keep upc upcfull

	* Keep only the UPCs whose `i'-character prefix appears in the GS1 table.
	merge m:1 upc using `gs1prefix'
	keep if _merge == 3
	drop _merge

	append using `upcfirm'
	*sleep 5000
	* upcfirm already names a temporary file declared before this loop;
	* overwrite it in place instead of allocating a new tempfile every pass.
	save `upcfirm', replace
}

* Write unmatchedupc.dta: the UPCs from allupc.dta that are absent from
* upcfirm, i.e. that matched no GS1 prefix in any pass. The codes are
* converted from string to numeric and stored sorted.
use allupc.dta, clear
rename upc upcfull
merge 1:1 upcfull using `upcfirm', keep(master) nogenerate
keep upcfull
rename upcfull upc
destring upc, replace
sort upc
save unmatchedupc.dta, replace
