% Chetty Data extended version of Table 1

clear all;            % start from an empty workspace (also clears globals and cached functions)
small = 1.0e-10;      % not referenced elsewhere in the visible script
pinv_tol = 1.0e-05;   % tolerance passed to pinv() in the regression section
big = 1.0e+8;         % not referenced elsewhere in the visible script
global datadir;       % declared but never assigned in the visible script
rng(197347);          % fixed seed so the randn draws below are reproducible

% -- File Directories
outdir = 'out/';      % prefix for the CSV results file written at the end
figdir = 'fig/';      % not referenced elsewhere in the visible script
matdir = 'mat/';      % not referenced elsewhere in the visible script
dataxls = '../Data/Chetty_Mobility/Chetty_Data_1.xlsx';  % reassigned in the data-reading section before first use
% Each addpath call prepends its folder, so the last folder added is searched first.
addpath('../matlab_functions');
addpath('../cscpc')
addpath('functions')

%%
q = 15;   % number of rows of the shared random-draw matrix emat

% --- Half-life grids for the confidence interval
% "ho" grid: n_hl equally spaced points on [0.001,1], then 30 points on
% [1.01,3], then one far-out point at 100, stacked as a column.
% (presumably "ho"/"ha" denote null/alternative -- confirm against c_ci)
n_hl = 100;
hl_grid_ho = [linspace(0.001,1,n_hl).'; linspace(1.01,3,30).'; 100];
n_hl_grid_ho = numel(hl_grid_ho);

% "ha" grid: n_hl_ha equally spaced points on [0.001,1], as a column.
n_hl_ha = 50;
hl_grid_ha = linspace(0.001,1.0,n_hl_ha).';

% --- One set of standard-normal draws, reused by every simulation below
nrep = 100000;
emat = randn(q,nrep);

%% 
% Read Data
% Variable descriptions: text cells A2:AT2 of the labels workbook, kept as a
% column cell array (indexed later with the same column numbers as the table).
dataxls = '../Data/Chetty_Mobility/Chetty_Data_Labels.xlsx';
[~,tmp] = xlsread(dataxls,'A2:AT2');
VariableDesc = tmp';

% Data table: one row per observation, with CZ, Lat, Lon and State columns.
dataxls = '../Data/Chetty_Mobility/Chetty_Data_1.xlsx';
T = readtable(dataxls);
CZ = T{:,'CZ'};
LatLon = [T{:,'Lat'} T{:,'Lon'}];
VariableNames=T.Properties.VariableNames;
State = T{:,'State'};
% state_id is an integer code per row identifying its State (cluster id).
[Su,ia,state_id]=unique(State);

% Find Continental US: ii_48 is 1 for rows whose State is neither 'HI' nor 'AK'.
ii_hi = ismember(State,'HI');
ii_ak = ismember(State,'AK');
ii_48 = ones(size(State,1),1)-ii_hi-ii_ak;
LatLon = LatLon(ii_48==1,:);
% Restrict the cluster ids to the same rows as LatLon / y_raw. Without this,
% the masks built later on the filtered rows would index the unfiltered
% state_id vector; MATLAB accepts a shorter logical index silently, so the
% cluster ids would be misaligned with the observations.
state_id = state_id(ii_48==1);


% Variables used, y and x (column numbers of T)
y_ind = 7;
x_ind = [13 14 16 18 19 20 21 27 28 29 40 41 42 43 44 45 24 30 31 32 34 35 36 37 38 39];

i = y_ind;
y_str1 = char(VariableNames(i));
y_raw = T{:,i};
y_raw = y_raw(ii_48==1);
n_xind = length(x_ind);


% Result containers: row 1 is the y variable, rows 2..n_xind+1 follow x_ind.
% Regression-based containers are only filled for rows 2..n_xind+1.
n_rows = n_xind+1;
Results_OLS_CSCPC = NaN(n_rows,3);       % [estimate, CI lower, CI upper]
Results_OLS_Cluster = NaN(n_rows,3);     % [estimate, CI lower, CI upper]
Results_FGLS_CSCPC = NaN(n_rows,3);      % [estimate, CI lower, CI upper]
Results_OLS_R2 = NaN(n_rows,1);
Results_FGLS_R2 = NaN(n_rows,1);
Results_OLS_SP_I1_res = NaN(n_rows,1);
Results_OLS_SP_I0_res = NaN(n_rows,1);
Results_FGLS_SP_I1_res = NaN(n_rows,1);
Results_FGLS_SP_I0_res = NaN(n_rows,1);
Results_SP_I1 = NaN(n_rows,1);
Results_SP_I0 = NaN(n_rows,1);
Results_hl_ci = NaN(n_rows,2);           % [lower, upper] half-life bounds

%% I1 and I0 Univariate tests
% For the y variable (iz == 1) and each x variable (iz >= 2): drop missing
% values, build the pairwise distance matrix, store the p-values of the
% spatial I(1) and I(0) tests, and store the range of half-life grid points
% whose p-value exceeds 5%.
for iz = 1:n_xind+1
  tic
  if iz == 1
    ivar = y_ind;
  else
    ivar = x_ind(iz-1);
  end
  Z = T{:,ivar};
  Z = Z(ii_48==1,:);    % same row filter that was applied to LatLon
  ii = isnan(Z);
  Z = Z(ii==0);
  s = LatLon(ii==0,:);
  n = size(s,1);        % number of locations (length(s) would return 2 when n < 2)
  latlongflag = 1;
  distmat = getdistmat(s,latlongflag);
  max_dist = max(max(distmat));
  % Half-life grids are expressed as fractions of the largest pairwise
  % distance; c = -log(0.5)/half-life converts each to a decay parameter.
  hl_grid_ha_max = hl_grid_ha*max_dist;
  hl_grid_ho_max = hl_grid_ho*max_dist;
  c_grid_ha = -log(0.5)./hl_grid_ha_max;
  c_grid_ho = -log(0.5)./hl_grid_ho_max;
  % I1 and I0 Tests
  SP_I1 = Spatial_I1_Test(Z,distmat,emat);
  SP_I0 = Spatial_I0_Test(Z,distmat,emat);
  Results_SP_I1(iz)=SP_I1.pvalue;
  Results_SP_I0(iz)=SP_I0.pvalue;
  % Confidence Interval
  % NOTE(review): assumes c_ci returns one p-value per element of c_grid_ho -- confirm.
  pv_mat=c_ci(Z,distmat,emat,c_grid_ho,c_grid_ha);
  ii = pv_mat > 0.05;  % 5% Critical value
  hl = hl_grid_ho(ii==1);
  if isempty(hl)
    % Every grid point was rejected: keep NaN bounds instead of aborting the
    % whole run on an empty-to-scalar assignment.
    ci_l = NaN;
    ci_u = NaN;
  else
    ci_l = min(hl);
    ci_u = max(hl);
  end
  Results_hl_ci(iz,1) = ci_l;
  Results_hl_ci(iz,2) = ci_u;
  iz 
  toc
end
fprintf('Finished with univariate analysis \n'); 
 
%%
% Regression results .. y onto x

% For each x variable: regress standardized y on standardized x in levels
% (OLS) and after the LBM-GLS transformation H, and store point estimates,
% confidence intervals, R2 and residual I(1)/I(0) test p-values in row ix+1
% of the Results_* arrays.

% Settings that do not change across regressors
latlongflag = 1;
rho_bm = 0.999;     % passed to getcbar for the BM covariance approximation
rhobar = 0.03;      % passed to cscpc
ci_level = 0.95;    % passed to cscpc

for ix = 1:n_xind
    i = x_ind(ix);
    x_str1 = char(VariableNames(i));
    x_raw = T{:,i};
    x_raw = x_raw(ii_48==1);
    % Keep rows where both y and x are observed
    iiy = 1-isnan(y_raw);
    iix = 1-isnan(x_raw);
    ii = iiy.*iix;

    y = y_raw(ii==1);
    x = x_raw(ii==1);
    s = LatLon(ii==1,:);
    % NOTE(review): this mask is defined on the rows of y_raw, so state_id
    % must hold one entry per row of y_raw for the cluster ids to line up.
    sid = state_id(ii==1,:);
    n = size(s,1);      % number of locations (length(s) would return 2 when n < 2)
    distmat = getdistmat(s,latlongflag);

    % Standardize
    ys = (y-mean(y))/std(y);
    xs = (x-mean(x))/std(x);

    [b,seb,veb]=ols_clustered_se(xs,ys,sid);

    % Construct LBM Covariance matrix
    % BM covariance matrix (approximation for demeaned value)
    c_bm = getcbar(rho_bm,distmat);
    sigdm_bm = get_sigma_dm(distmat,c_bm);

    % Construct Matrix Square Root and its pseudo-inverse H (the GLS transform).
    % The diagnostic product H*sigdm_bm*H' is no longer formed: it was an
    % unused n-by-n computation in every iteration.
    sigdm_bm_sqrt = sqrt_psd(sigdm_bm);
    H = pinv(sigdm_bm_sqrt,pinv_tol);
    hy = H*ys;
    hx = H*xs;
    % hy = hy-mean(hy);  not needed, H*ones(n,1) = 0
    % hx = hx-mean(hx);

    rslt_ols = cscpc(ys,xs,s,latlongflag,rhobar,ci_level);  % Run CSCPC
    rslt_fgls = cscpc(hy,hx,s,latlongflag,rhobar,ci_level);  % Run CSCPC
    Results_OLS_Cluster(ix+1,:) = [b b-1.96*seb b+1.96*seb];
    Results_FGLS_CSCPC(ix+1,:) = [rslt_fgls.beta_hat rslt_fgls.ci_cscpc];
    Results_OLS_CSCPC(ix+1,:) = [rslt_ols.beta_hat rslt_ols.ci_cscpc];

    % R2 of the levels regression (no constant: ys and xs are standardized)
    b = xs\ys;
    e = ys-xs*b;
    R2 = 1-(sum(e.^2)/sum(ys.^2));
    Results_OLS_R2(ix+1) = R2;

    % R2 of the transformed regression
    b = hx\hy;
    e = hy-hx*b;
    R2 = 1-(sum(e.^2)/sum(hy.^2));
    Results_FGLS_R2(ix+1) = R2;

    % Test for unit root in residuals
    SP_I1_Res = Spatial_I1_Test_Residual(ys,[ones(n,1) xs],distmat,emat);
    Results_OLS_SP_I1_res(ix+1) = SP_I1_Res.pvalue;
    SP_I1_Res = Spatial_I1_Test_Residual(hy,[ones(n,1) hx],distmat,emat);
    Results_FGLS_SP_I1_res(ix+1) = SP_I1_Res.pvalue;
    % Test for I(0) residual
    SP_I0_Res = Spatial_I0_Test_Residual(ys,[ones(n,1) xs],distmat,emat);
    % Store the I(0) p-value here; the I(1) result was previously written
    % into this slot by mistake.
    Results_OLS_SP_I0_res(ix+1) = SP_I0_Res.pvalue;
    SP_I0_Res = Spatial_I0_Test_Residual(hy,[ones(n,1) hx],distmat,emat);
    Results_FGLS_SP_I0_res(ix+1) = SP_I0_Res.pvalue;
end
%%
% Save as CSV File
% One row per variable (y first, then each x in x_ind order): name and
% description, univariate results, levels-regression results, and LBM-GLS
% regression results. Confidence intervals are written as [lo;hi] so that
% they stay inside a single CSV cell.
outfile_name = [outdir 'Chetty_Variables_Table1_Results.csv'];
[fileID,fopen_msg] = fopen(outfile_name,'w');
if fileID == -1
    error('Unable to open %s for writing: %s',outfile_name,fopen_msg);
end
try
    fprintf(fileID,'Name, Desc,, Univariate Analysis,,,,Levels Regression,,,,,,LBMGLS Regression \n');
    fprintf(fileID,',,, I(1) pv, I(0) pv, ci-hl,,');
    fprintf(fileID,'R2,bhat [cluster],bhat[cscpc], I(1) pv, I(0) pv,,');
    fprintf(fileID,'R2,bhat[cscpc], I(1) pv, I(0) pv \n');
    for ii = 1:n_xind+1
        if ii == 1
            i = y_ind;
        else
            i = x_ind(ii-1);
        end
        str1 = [char(VariableNames(i)) ',' char(VariableDesc(i))];
        % label -- written through %s so that '%' or '\' inside a name or
        % description is emitted literally instead of being parsed as a
        % format specifier
        fprintf(fileID,'%s,,',str1);
        % univariate results
        tmp = [Results_SP_I1(ii) Results_SP_I0(ii) Results_hl_ci(ii,:)];
        fprintf(fileID,'%5.2f,%5.2f,[%5.2f;%5.2f],,',tmp);
        % Level regression results
        tmp = [Results_OLS_R2(ii) Results_OLS_Cluster(ii,:) Results_OLS_CSCPC(ii,:) Results_OLS_SP_I1_res(ii) Results_OLS_SP_I0_res(ii)];
        fprintf(fileID,'%5.2f, %5.2f [%5.2f;%5.2f], %5.2f [%5.2f;%5.2f], %5.2f, %5.2f,,',tmp);
        % LBMGLS regression results
        tmp = [Results_FGLS_R2(ii) Results_FGLS_CSCPC(ii,:) Results_FGLS_SP_I1_res(ii) Results_FGLS_SP_I0_res(ii)];
        fprintf(fileID,'%5.2f, %5.2f [%5.2f;%5.2f], %5.2f, %5.2f',tmp);

        fprintf(fileID,'\n');

    end
catch write_err
    % Release the file handle before propagating the failure
    fclose(fileID);
    rethrow(write_err);
end
% Close the file so the output is flushed and the handle is released
fclose(fileID);
%%

% ----------- Functions -------------
function [b,seb,veb]=ols_clustered_se(x,y,cluster_var)
% OLS_CLUSTERED_SE  OLS of y on x with cluster-robust (sandwich) standard errors.
%
% Inputs
%   x            n-by-k regressor matrix (used as given; no constant is added)
%   y            n-by-1 dependent variable
%   cluster_var  n-by-1 cluster labels; rows sharing a label form one cluster
%
% Outputs
%   b    k-by-1 OLS coefficients, x\y
%   seb  k-by-1 standard errors, sqrt(diag(veb))
%   veb  k-by-k covariance matrix of b
%
% veb = q * (x'x)^(-1) * S * (x'x)^(-1), where S sums (x_c'*e_c)*(x_c'*e_c)'
% over clusters c and q = ((n-1)/(n-k)) * (G/(G-1)) with G the number of
% clusters. With a single cluster (G == 1) or n == k the factor q is not
% finite, and so neither are veb and seb.
b = x\y;
e = y-x*b;
[c,~,ind]=unique(cluster_var);
ncluster = numel(c);
n = size(x,1);
k = size(x,2);

% Middle term of the sandwich: outer products of the per-cluster score sums
xe_xe = zeros(k,k);
for g = 1:ncluster
    in_g = (ind==g);
    score_g = x(in_g,:)'*e(in_g);
    xe_xe = xe_xe+score_g*score_g';
end

% (x'x)^(-1) * xe_xe * (x'x)^(-1), computed with linear solves rather than
% an explicit inverse
xx = x'*x;
V = xx\(xe_xe/xx);

% Finite-sample degrees-of-freedom adjustment
q = ((n-1)/(n-k))*(ncluster/(ncluster-1));
veb = q*V;
seb = sqrt(diag(veb));

end
