% Chetty Data extended version of Table 1
% Compute Results here and save for future use and printing into table

% ---- Workspace reset, numerical constants, and random seed ----
clear all;
small = 1e-10;       % tolerance constants; not referenced in the visible part of this script
pinv_tol = 1e-05;
big = 1e8;
global datadir;
rng(197347);         % fix the seed so simulated p-values are reproducible

% ---- Output folders and search path ----
outdir = 'out/';
figdir = 'fig/';
matdir = 'mat/';
% Added one at a time so the most recently added folder keeps top precedence
addpath('../matlab_functions');
addpath('../cscpc')
addpath('functions')

%%
q = 15;   % row dimension of the simulated normal draws in emat
% ---- Half-life grids used for the confidence interval ----
% "ho" grid: 100 points on [0.001,1], 30 points on [1.01,3], plus the single value 100
n_hl = 100;
hl_grid_ho = [linspace(0.001,1,n_hl)'; linspace(1.01,3,30)'; 100];
n_hl_grid_ho = numel(hl_grid_ho);
% "ha" grid: 50 points on [0.001,1]
n_hl_ha = 50;
hl_grid_ha = linspace(0.001,1.0,n_hl_ha)';

% ---- One common set of standard normal draws shared by every simulation below ----
nrep = 100000;
emat = randn(q,nrep);

%% 
% ---- Load variable descriptions (row 2 of the labels workbook) ----
dataxls = '../Data/Chetty_Data_Labels.xlsx';
[~,tmp] = xlsread(dataxls,'A2:AT2');
VariableDesc = tmp.';

% ---- Load the data table and pull out identifiers and coordinates ----
dataxls = '../Data/Chetty_Data_1.xlsx';
T = readtable(dataxls);
CZ = T.CZ;
LatLon = [T.Lat, T.Lon];
VariableNames = T.Properties.VariableNames;
State = T.State;
[Su,ia,state_id] = unique(State);   % state_id: integer code per row, one value per distinct state

% ---- Keep the continental US only: drop Hawaii and Alaska ----
ii_hi = ismember(State,'HI');
ii_ak = ismember(State,'AK');
ii_48 = double(~(ii_hi | ii_ak));   % 1 for rows to keep, 0 for HI/AK rows
LatLon = LatLon(ii_48==1,:);
state_id = state_id(ii_48==1);

% ---- Column positions in T of the dependent variable and the regressors ----
y_ind = 7;
x_ind = [13 14 16 18 19 20 21 27 28 29 40 41 42 43 44 45 24 30 31 32 34 35 36 37 38 39];

% ---- Dependent variable restricted to the 48-state sample ----
i = y_ind;
y_str1 = VariableNames{i};
y_raw = T{:,i};
y_raw = y_raw(ii_48==1);

% ---- Preallocate result containers ----
% Row 1 is reserved for the y variable; rows 2..n_xind+1 correspond to the
% entries of x_ind in order. Everything starts as NaN so that unfilled
% entries are recognizable.
n_xind = numel(x_ind);
n_rows = n_xind + 1;

% Point estimates with intervals
Results_OLS_CSCPC   = NaN(n_rows,3);   % OLS CSCPC
Results_OLS_Cluster = NaN(n_rows,4);   % OLS Clustered SE
Results_FE_Cluster  = NaN(n_rows,4);   % Fixed Effects with Clustered SE
Results_FGLS_CSCPC  = NaN(n_rows,3);   % LBM-GLS with CSPC

% Goodness of fit
Results_OLS_R2  = NaN(n_rows,1);       % Simple R2
Results_FE_R2   = NaN(n_rows,1);       % Within R2 using fixed effects
Results_FGLS_R2 = NaN(n_rows,1);       % R2 with GLS transformation

% p-values of spatial persistence tests on regression residuals
Results_OLS_SP_I1_res  = NaN(n_rows,1);   % I(1) test for OLS residuals
Results_OLS_SP_I0_res  = NaN(n_rows,1);   % I(0) test for OLS residuals
Results_FGLS_SP_I1_res = NaN(n_rows,1);   % I(1) test for FGLS residuals
Results_FGLS_SP_I0_res = NaN(n_rows,1);   % I(0) test for FGLS residuals

% Univariate results for each variable
Results_SP_I1 = NaN(n_rows,1);         % I(1) test for y
Results_SP_I0 = NaN(n_rows,1);         % I(0) test for y
Results_hl_ci = NaN(n_rows,2);         % Half-life confidence interval

% Labels
VarNames = cell(n_rows,1);
VarDesc  = cell(n_rows,1);

% Univariate Analysis
% For the y variable (iz == 1) and each x variable (iz == 2..n_xind+1):
%   - run the spatial I(1) and I(0) tests and store their p-values
%   - build a 95% confidence set for the half-life from the p-values
%     returned by c_ci over hl_grid_ho, and store its smallest and
%     largest accepted grid points

% Convert the half-life grids to grids for c, using c = -log(0.5)/half-life.
% These do not depend on the variable, so they are computed once.
c_grid_ha = -log(0.5)./hl_grid_ha;
c_grid_ho = -log(0.5)./hl_grid_ho;
latlongflag = 1;
for iz = 1:n_xind+1
    tic
    if iz == 1
        ivar = y_ind;
    else
        ivar = x_ind(iz-1);
    end
    VarNames{iz} = VariableNames(ivar);
    VarDesc{iz} = VariableDesc(ivar);
    Z = T{:,ivar};        % Raw Data
    Z = Z(ii_48==1,:);    % 48 States
    ii = isnan(Z);        % Missing Values
    Z = Z(ii==0);         % Delete Missing Values
    s = LatLon(ii==0,:);  % Delete corresponding LatLon .. note that LatLon is already for 48 states
    n = size(s,1);        % Number of observations (rows of s; length(s) would return 2 for a single location)
    distmat = getdistmat_normalized(s,latlongflag);
    % I1 and I0 Tests
    SP_I1 = Spatial_I1_Test(Z,distmat,emat);
    SP_I0 = Spatial_I0_Test(Z,distmat,emat);
    Results_SP_I1(iz)=SP_I1.pvalue;
    Results_SP_I0(iz)=SP_I0.pvalue;
    % Confidence Interval
    pv_mat=c_ci(Z,distmat,emat,c_grid_ho,c_grid_ha);
    ii = pv_mat > 0.05;  % 5% Critical value
    hl = hl_grid_ho(ii==1);
    if isempty(hl)
        % No grid point is accepted at the 5% level: min/max would be empty
        % and the assignments below would fail. Leave the preallocated NaNs.
        ci_l = NaN;
        ci_u = NaN;
        warning('Half-life confidence set is empty for variable %d; leaving NaN.',iz);
    else
        ci_l = min(hl);
        ci_u = max(hl);
    end
    Results_hl_ci(iz,1) = ci_l;
    Results_hl_ci(iz,2) = ci_u;
    iz
    toc
end
fprintf('Finished with univariate analysis \n');
 
%
% Regression results .. y onto x
% For each x variable, regress standardized y on standardized x and store
% (in row ix+1 of the result arrays):
%   - OLS and state fixed-effects estimates with state-clustered SEs
%   - CSCPC results for the OLS data and for the LBM-GLS transformed data
%   - R2 for OLS and for the GLS-transformed regression
%   - p-values of the spatial I(1) and I(0) tests on both sets of residuals

latlongflag = 1;
rhobar = 0.03;
ci_level = 0.95;

for ix = 1:n_xind
    tic
  % Get X Data
  i = x_ind(ix);
  x_str1 = [char(VariableNames(i))];
  x_raw = T{:,i};
  x_raw = x_raw(ii_48==1);
  % Delete missing observations in the matrices that are used in the analysis
  iiy = 1-isnan(y_raw);
  iix = 1-isnan(x_raw);
  ii = iiy.*iix;          % 1 only where both y and x are observed
  y = y_raw(ii==1);
  x = x_raw(ii==1);
  s = LatLon(ii==1,:);
  sid = state_id(ii==1);  % State ID used for clustering and fixed effects

  % Get distance matrix
  n = size(s,1);          % number of locations (rows of s)
  distmat = getdistmat_normalized(s,latlongflag);

  % Standardize
  ys = (y-mean(y))/std(y);
  xs = (x-mean(x))/std(x);

  % OLS with clustered SEs
  [b,seb,veb]=ols_clustered_se(xs,ys,sid);
  % Fixed Effects with clustered SEs
  [b_fe,seb_fe,veb_fe,R2_fe]=fe_clustered_se(xs,ys,sid);

  % Construct Matrix Square Root of Inverse of sigdm_bm for use as GLS transformation
  H = lbm_gls_matrix(s,latlongflag);

  % Transform data
  hy = H*ys;
  hx = H*xs;
  hy = hy-mean(hy);  % should not be needed because H*ones(n,1) = 0, but numerical errors in the computation of H for large n  might make this useful
  hx = hx-mean(hx);

  rslt_ols = cscpc(ys,xs,s,latlongflag,rhobar,ci_level);        % Run CSCPC for OLS data
  rslt_fgls = cscpc(hy,hx,s,latlongflag,rhobar,ci_level);       % Run CSCPC for GLS data
  Results_OLS_Cluster(ix+1,:) = [b, seb, b-1.96*seb, b+1.96*seb];  % Use pm 1.96 for 95% CI
  Results_FE_Cluster(ix+1,:) = [b_fe, seb_fe, b_fe-1.96*seb_fe, b_fe+1.96*seb_fe];
  Results_FE_R2(ix+1) = R2_fe;
  Results_OLS_CSCPC(ix+1,:) = [rslt_ols.beta_hat rslt_ols.ci_cscpc];
  Results_FGLS_CSCPC(ix+1,:) = [rslt_fgls.beta_hat rslt_fgls.ci_cscpc];

  % R2 of the standardized (no-constant) OLS regression
  b = xs\ys;
  e = ys-xs*b;
  R2 = 1-(sum(e.^2)/sum(ys.^2));
  Results_OLS_R2(ix+1) = R2;

  % R2 of the GLS-transformed regression
  b = hx\hy;
  e = hy-hx*b;
  R2 = 1-(sum(e.^2)/sum(hy.^2));
  Results_FGLS_R2(ix+1) = R2;

  % Test for unit root in residuals
  SP_I1_Res = Spatial_I1_Test_Residual(ys,[ones(n,1) xs],distmat,emat);
  Results_OLS_SP_I1_res(ix+1) = SP_I1_Res.pvalue;
  SP_I1_Res = Spatial_I1_Test_Residual(hy,[ones(n,1) hx],distmat,emat);
  Results_FGLS_SP_I1_res(ix+1) = SP_I1_Res.pvalue;
  % Test for I(0) residual
  SP_I0_Res = Spatial_I0_Test_Residual(ys,[ones(n,1) xs],distmat,emat);
  Results_OLS_SP_I0_res(ix+1) = SP_I0_Res.pvalue;   % store the I(0) result, not the I(1) result computed above
  SP_I0_Res = Spatial_I0_Test_Residual(hy,[ones(n,1) hx],distmat,emat);
  Results_FGLS_SP_I0_res(ix+1) = SP_I0_Res.pvalue;

  ix
  toc
end

% Collect and Save Results
% All result arrays are gathered into one struct and written to
% [matdir 'Table_1_Results.mat'] for later use.
Table_1_Results.OLS_CSCPC = Results_OLS_CSCPC;
Table_1_Results.OLS_Cluster = Results_OLS_Cluster;
Table_1_Results.FE_Cluster = Results_FE_Cluster;
Table_1_Results.FGLS_CSCPC = Results_FGLS_CSCPC;
Table_1_Results.OLS_R2 = Results_OLS_R2;
Table_1_Results.FE_R2 = Results_FE_R2;
Table_1_Results.FGLS_R2 = Results_FGLS_R2;
Table_1_Results.OLS_SP_I1_res = Results_OLS_SP_I1_res;
Table_1_Results.OLS_SP_I0_res = Results_OLS_SP_I0_res;
Table_1_Results.FGLS_SP_I1_res = Results_FGLS_SP_I1_res;
Table_1_Results.FGLS_SP_I0_res = Results_FGLS_SP_I0_res;
Table_1_Results.SP_I1 = Results_SP_I1;
Table_1_Results.SP_I0 = Results_SP_I0;
Table_1_Results.hl_ci = Results_hl_ci;
Table_1_Results.VarNames = VarNames;
Table_1_Results.VarDesc = VarDesc;

% save() errors if the target folder is missing, which would discard the
% results of the long simulation above; create the folder first if needed.
if exist(matdir,'dir') ~= 7
    mkdir(matdir);
end
save([matdir 'Table_1_Results.mat'],'Table_1_Results');


% ----------- Functions -------------
function [b,seb,veb]=ols_clustered_se(x,y,cluster_var)
    % OLS_CLUSTERED_SE  Least-squares fit of y on x with cluster-robust SEs.
    %
    % Inputs
    %   x            n-by-k regressor matrix, used exactly as passed
    %                (no constant column is added inside this function)
    %   y            dependent variable, one entry per row of x
    %   cluster_var  cluster identifier, one entry per row of x; rows with
    %                equal values belong to the same cluster
    %
    % Outputs
    %   b    coefficients from x\y
    %   seb  square roots of the diagonal of veb
    %   veb  sandwich covariance inv(x'x) * sum_g (x_g'e_g)(x_g'e_g)' * inv(x'x),
    %        multiplied by ((n-1)/(n-k-1)) * (G/(G-1)) with G clusters
    %        (intended to mirror Stata's degrees-of-freedom correction)

    % Point estimate and residuals
    b = x\y;
    resid = y - x*b;

    % Map each row to an integer cluster index 1..G
    [labels,~,gid] = unique(cluster_var);
    G = length(labels);
    [n,k] = size(x);

    % Accumulate the outer products of the within-cluster score sums
    meat = zeros(k,k);
    for g = 1:G
        members = (gid==g);
        score = x(members,:)'*resid(members);
        meat = meat + score*score';
    end

    % Sandwich estimator with the finite-sample scaling
    bread = inv(x'*x);
    dfc = ((n-1)/(n-k-1))*(G/(G-1));
    veb = dfc*(bread*meat*bread);
    seb = sqrt(diag(veb));
end
    
    % ----------- Functions -------------
    function [b,seb,veb,r2]=fe_clustered_se(x,y,cluster_var)
      % FE_CLUSTERED_SE  Fixed-effects regression of y on x with clustered SEs.
      %
      % The fixed effects are removed by subtracting cluster means from x
      % and y; the same cluster variable is then used for the clustered
      % covariance matrix.
      %
      % Inputs
      %   x            n-by-k regressor matrix
      %   y            dependent variable, one entry per row of x
      %   cluster_var  cluster identifier, one entry per row of x
      %
      % Outputs
      %   b    coefficients from the regression of demeaned y on demeaned x
      %   seb  square roots of the diagonal of veb
      %   veb  sandwich covariance scaled by
      %        ((n-1)/(n-k-ncluster)) * (ncluster/(ncluster-1))
      %   r2   within R2: 1 - SSR/sum(y_demeaned.^2)
      [c,~,ind]=unique(cluster_var);
      ncluster = length(c);
      x_dm = NaN(size(x));
      y_dm = NaN(size(y));
      % Form cluster-demeaned variables
      for i = 1:ncluster
          sel = (ind==i);
          xc = x(sel,:);
          yc = y(sel);
          % Take column means explicitly: for a cluster with a single
          % observation xc is a 1-by-k row, and mean(xc) would average
          % across the k regressors instead of within each column.
          xc = xc-mean(xc,1);
          yc = yc-mean(yc);
          x_dm(sel,:) = xc;
          y_dm(sel) = yc;
      end
      x = x_dm;
      y = y_dm;
      b = x\y;
      e = y-x*b;
      r2 = 1-(sum(e.^2)/sum(y.^2));
      n = size(x,1);
      k = size(x,2);
      % Sum over clusters of the outer product of the score sums x_c'*e_c
      xe_xe = zeros(k,k);
      for i = 1:ncluster
          xc = x(ind==i,:);
          ec = e(ind==i);
          ac = xc'*ec;
          xe_xe = xe_xe+ac*ac';
      end
      xx = x'*x;
      xxi = inv(xx);
      V = xxi*xe_xe*xxi;
      % Here I am trying to use the same DF correction as in Stata;
      % the denominator subtracts both k and the number of clusters.
      q = ((n-1)/(n-k-ncluster))*(ncluster/(ncluster-1));
      V = q*V;
      veb = V;
      seb = sqrt(diag(veb));

    end


