% Examine World Bank Data
%
% For every series listed in WB_Series.xlsx, compute the change between the
% 2005 and 2015 cross-sections (log-difference for strictly positive series,
% level difference otherwise), screen out outliers and degenerate series,
% and write the surviving series plus their metadata to
% Series_Data_2015_2005.xlsx.
%
% Relies on packr (local here is not defined; external helper) and pctile
% via adjout_a -- presumably packr drops rows containing NaN; confirm
% against the toolbox on the path.
clear all;

% Read Series
dname = ['WB_Series.xlsx'];
Series_Table = readtable(dname);

% Read Data
dname = ['WB_2015.xlsx'];
Data_Table_2015 = readtable(dname);
dname = ['WB_2005.xlsx'];
Data_Table_2005 = readtable(dname);
% NOTE(review): 'Lattitude' must match the column header in WB_2015.xlsx as
% spelled there -- do not "correct" it here without renaming the column.
Ty = Data_Table_2015(:,{'CountryCode','Lattitude','Longitude'});
% NOTE(review): the element-wise differences below assume the 2005 and 2015
% tables list the same countries in the same row order -- confirm.

% Metadata of the series that survive all screens. Preallocated as empty so
% the final table() call is well defined even when no series qualifies.
SeriesCode = cell(0,1);
Tcode = cell(0,1);
Topic = cell(0,1);
IndicatorName = cell(0,1);
LongDefinition = cell(0,1);

% Summarize Results for each series
ns = size(Series_Table,1);
kk = 0;   % number of series kept so far
for is = 1:ns
    sname = Series_Table{is,'SeriesCode'};
    sname_str = char(sname);
    % Table variable names use '_' where the series code has '.'
    sname2_str = strrep(sname_str,'.','_');
    iname_str = char(Series_Table{is,'IndicatorName'});
    % Display form of the name ('%' spelled out); kept so printed output
    % is unchanged.
    iname_str = strrep(iname_str,'%','Percent');
    x_2015 = Data_Table_2015{:,sname2_str};
    x_2005 = Data_Table_2005{:,sname2_str};
    % Countries with data in both years (the sum is NaN if either is NaN)
    ii = 1 - isnan(x_2015+x_2005);
    nc_x = sum(ii);
    fprintf('%3i of %3i \n',[is ns]);
    % Data text is passed as an argument, never as the format string, so
    % backslashes or '%' in names cannot be interpreted by fprintf.
    fprintf('%s\n',['Series: ' sname_str '  Name: ' iname_str]);
    fprintf('   Number of countries: %3i \n',nc_x);

    % Check to see if '%' is in title
    pct_ind = contains(Series_Table{is,'IndicatorName'},'%');
    % Check to see if 'index' is in title
    index_ind = contains(Series_Table{is,'IndicatorName'},'index');

    % Determine log or level
    if pct_ind == 1
        y = x_2015-x_2005;
        tcode = 'level';
    elseif index_ind == 1
        y = x_2015-x_2005;
        tcode = 'level';
    elseif min(min((packr([x_2015 x_2005])))) <= 0
        % Logs are undefined for non-positive values
        y = x_2015-x_2005;
        tcode = 'level';
    else
        % Log-difference: both years must be in logs
        y = log(x_2015)-log(x_2005);
        tcode = 'log';
    end
    % Eliminate series with 10 or fewer non-zero values
    ii = packr(y) ~= 0;
    if sum(ii) <= 10
        y = NaN(size(y));
    end


    % Check for outliers .. replace with missing values
    yoa = adjout_a(y,5);
    % Outliers = observations that became NaN in the adjustment
    noa = sum(isnan(yoa))-sum(isnan(y));
    fprintf('Transformation: %s\n',tcode);
    fprintf('Number of outliers: %3i \n',noa);
    nyoa = sum(isnan(yoa)==0);
    fprintf('Number of obs in yoa: %3i \n',nyoa);
    % Kurtosis = fourth central moment / squared second central moment
    ym = yoa - mean(packr(yoa));
    ykur = mean(packr(ym).^4)/(mean(packr(ym).^2)^2);
    fprintf('   Kurtosis %5.1f \n',ykur);
    if ykur > 20
        yoa = NaN(size(yoa));
    end
    % Check for number of unique values, make sure at least 10 with
    % different values
    if sum(isnan(yoa)==0) >= 100
        tmp = packr(yoa);
        % Distinct values of the NaN-free data (unique on the raw vector
        % would list every NaN as a separate, never-matching value)
        c = unique(tmp);
        ic = tmp == c';       % obs-by-distinct-value match matrix
        nc = sum(ic)';        % count of observations at each distinct value
        mc = max(nc);         % size of the most common value
        % Drop the series if all but 10 or fewer obs share a single value
        if (size(tmp,1)-mc) <= 10
            yoa = NaN(size(yoa));
        end
    end

%     if ykur > 10
%      ykur
%      fig = figure;
%      subplot(2,2,1);
%      histogram(x_2015);
%      subplot(2,2,2);
%      histogram(x_2005);
%      subplot(2,2,3);
%      histogram(y);
%      subplot(2,2,4);
%      histogram(yoa);
%      waitforbuttonpress;
%      close(fig);
%     end

    % Keep the series only if at least 100 countries remain
    if sum(isnan(yoa)==0) >= 100
        kk = kk+1;
        SeriesCode{kk,1} = sname2_str;
        Topic{kk,1} = Series_Table.Topic{is};
        IndicatorName{kk,1} = Series_Table.IndicatorName{is};
        LongDefinition{kk,1} = Series_Table.LongDefinition{is};
        Tcode{kk,1} = tcode;
        Ty = [Ty table(yoa,'VariableNames',{sname2_str})];
    end
end
Series_Table_2015_2005 = table(SeriesCode,Tcode,Topic,IndicatorName,LongDefinition);
% First write replaces any existing workbook; second adds the data sheet
writetable(Series_Table_2015_2005,'Series_Data_2015_2005.xlsx','Sheet','Series_Table','Writemode','replacefile');
writetable(Ty,'Series_Data_2015_2005.xlsx','Sheet','Data_Table');

% Functions 
function [yoa] = adjout_a(y,thr)
% ADJOUT_A  Replace outliers in y with NaN using an IQR-based band.
%
%   yoa = adjout_a(y,thr)
%
%   y   : data vector (may contain NaN)
%   thr : band half-width, in multiples of the interquartile range
%   yoa : copy of y in which screened values farther than thr*IQR from the
%         median are set to NaN
%
%   Values that are NaN, exactly 0, or exactly 100 are excluded from the
%   quartile computation and are passed through unchanged.
%   If y has fewer than 4 distinct non-missing values it is returned as is.
%
%   Uses the external helpers packr and pctile (not defined in this file);
%   presumably pctile returns the requested percentiles of its first
%   argument in the order given -- confirm against the toolbox.

yoa = y;

% Too few distinct values for quartiles to be meaningful: nothing to do
distinct_vals = unique(packr(y));
if size(distinct_vals,1) < 4
    return;
end

% Observations that take part in the screen
screened = ~(isnan(y) | (y == 0) | (y == 100));
z = y(screened);

% Quartiles of the screened observations
q = pctile(z,[0.25 0.50 0.75]);
z_med = q(2);
z_spread = q(3) - q(1);

% Keep values inside the band, blank out the rest
inside = abs(z - z_med) <= (thr*z_spread);
za = NaN(size(z,1),1);
za(inside) = z(inside);
yoa(screened) = za;

end


