% Session-wide graphics defaults applied to every figure/axes created below:
% large white figures, 32-pt axes and text fonts, non-bold axes titles.
set(groot, ...
    'defaultAxesPosition',            [0.15 0.15 0.8 0.8], ...
    'defaultAxesFontUnits',           'points', ...
    'defaultAxesFontSize',            32, ...
    'defaultTextFontSize',            32, ...
    'defaultAxesTitleFontWeight',     'normal', ...
    'defaultFigurePaperPositionMode', 'auto', ...
    'defaultFigurePosition',          [50 50 2000 1400], ...
    'defaultFigureColor',             'White');

%% Load the gene-name lists
% Each file is read as a column cell array of whitespace-delimited tokens.
% textscan with the '%s' format replaces the no-longer-recommended textread
% and yields the same tokens; an unreadable file now raises an explicit error
% naming the file.
geneListFiles = {'geneInfo.protein_coding.txt', 'curated_genes.txt', 'hgnc_names.txt'};
geneLists = cell(size(geneListFiles));
for iFile = 1:numel(geneListFiles)
    fid = fopen(geneListFiles{iFile}, 'r');
    if fid < 0
        error('Cannot open gene list file: %s', geneListFiles{iFile});
    end
    tokens = textscan(fid, '%s');
    fclose(fid);
    geneLists{iFile} = tokens{1};
end
genesProtein = geneLists{1};   % tokens from geneInfo.protein_coding.txt
genesDisease = geneLists{2};   % tokens from curated_genes.txt
genesNames   = geneLists{3};   % tokens from hgnc_names.txt

%% Load the per-cluster differential-expression tables
% Worksheets are read by position (1..N); sheetNames holds the display label
% used as the plot title for the table at the same position.
% NOTE(review): the labels are assumed to follow the workbook's sheet order -
% confirm against the file.
dgeFile = 'write_pbmc5k/MajorClusters_4/DGEperCluster.xlsx';
sheetNames = {'T cells', 'Monocytes', 'B cells', 'Natural killer cells'};

% The workbook's own sheet names are only used to verify that enough sheets
% exist for the labels defined above.
xlsSheets = sheetnames(dgeFile);
if numel(xlsSheets) < numel(sheetNames)
    error('%s has %d sheet(s) but %d cluster labels are defined.', ...
        dgeFile, numel(xlsSheets), numel(sheetNames));
end

% Start from a fresh cell array: a longer T left in the workspace by an
% earlier run would otherwise keep stale tables that the loops over
% 1:length(T) would pick up.
T = cell(1, numel(sheetNames));
for ii = 1:numel(sheetNames)
    T{ii} = readtable(dgeFile, 'Sheet', ii);
end

%%
% sheetNames={'T cells'};
% T{1}=readtable('DGEperCluster_Tcells.csv');

%% Gene-category masks for the rows of the first cluster table
vvProtein=ismember(T{1}.names, genesProtein);
vvDisease=ismember(T{1}.names, genesDisease);

%% calculate p_adj
% For every cluster table: select genes passing the count and relative-CPM
% difference filters, adjust their p-values separately for positive and
% negative log fold changes with funBH (defined elsewhere; presumably a
% Benjamini-Hochberg correction - confirm), store the result as column padj,
% and print one row of counts:
%   [cluster, nUp, nUp&protein, nUp&disease, nDown, nDown&protein, nDown&disease]
padjCut = 0.1;                                  % significance threshold on padj
signAll1=false(length(T{1}.names),1);           % significant & lfc>0 in any cluster
signAll2=signAll1;                              % significant & lfc<0 in any cluster
for ii=1:length(T)
    T1 = T{ii};

    % signAll1/signAll2 are combined row-by-row across clusters, which is
    % only meaningful when every table lists the genes in the same order.
    if ~isequal(T1.names, T{1}.names)
        warning('DGE:rowOrder', ...
            'Gene order in table %d differs from table 1; signAll1/signAll2 combine rows by position.', ii);
    end

    % Category membership is evaluated on this table's own gene names, so
    % the printed counts do not depend on the row order of table 1.
    isProtein = ismember(T1.names, genesProtein);
    isDisease = ismember(T1.names, genesDisease);

    vv = max(T1.count_ref,T1.count_pan)>10 & abs(T1.cpm_ref-T1.cpm_pan)./(T1.cpm_ref+T1.cpm_pan)>0.1;
    vv1 = vv & T1.logfoldchanges>0;
    vv2 = vv & T1.logfoldchanges<0;

    % Genes failing the filters keep padj = 1. funBH is only called when
    % there is something to adjust, since its behaviour on empty input is
    % not visible here.
    padj=ones(length(vv),1);
    if any(vv1)
        padj(vv1)=funBH(T1.pvals(vv1));
    end
    if any(vv2)
        padj(vv2)=funBH(T1.pvals(vv2));
    end
    T{ii}.padj = padj;

    signG=padj<padjCut;
    disp([ii  nnz(signG & vv1)  nnz(signG & vv1 & isProtein) nnz(signG & vv1 & isDisease) ...
              nnz(signG & vv2)  nnz(signG & vv2 & isProtein) nnz(signG & vv2 & isDisease) ]);

    signAll1=signAll1 | (signG & vv1);
    signAll2=signAll2 | (signG & vv2);

end

%% plots for each cluster
% For each selected cluster: plot the clipped log fold change against
% max(cpm_pan, cpm_ref) on a log x-axis, overlay genes with padj<=0.1 in red,
% label the strongest of those genes, and export the figure as PNG and PDF.
figDir = 'matlab_figures';
if ~isfolder(figDir)
    mkdir(figDir);      % exportgraphics needs the output folder to exist
end

for ii=1%:length(T)
    T1 = T{ii};
    vv = max(T1.cpm_ref,T1.cpm_pan)>0.1;% & abs(T1.cpm_ref-T1.cpm_pan)./(T1.cpm_ref+T1.cpm_pan)>0.01;
    T1 = T1(vv,:);

    % Clip log fold changes to [-4, 4] for display.
    lfc = T1.logfoldchanges;
    lfc(lfc>4) = 4;
    lfc(lfc<-4) = -4;

%     logp = -log10(T1.pvals);
%     logp(logp>5) = 5;
% 
%     figure(100+ii);    
%     scatter(lfc, logp, 'o');

    pane = T1.cpm_pan;
    refe = T1.cpm_ref;
    plx=max(pane,refe);     % x coordinate: the larger of the two CPM values

    % Close and reopen so the figure starts from the default settings.
    figure(200+ii);close(200+ii);figure(200+ii);
    plot(plx, lfc, 'o', 'MarkerSize', 10, 'MarkerEdgeColor','Blue', 'LineWidth',2);
    
    vv1 = T1.padj<=0.1;
    line(plx(vv1), lfc(vv1), 'LineStyle','none', 'Marker', 'o', 'MarkerSize', 10, 'MarkerEdgeColor','Red', 'LineWidth',2);
    hold off
    set(gca,'Xscale','log');
    xlim([0.5 110]);

    % Genes to label: significant, CPM > 1, |clipped lfc| > 1, no '.' in the
    % name, and present in genesNames. The mask is built once for all rows
    % instead of testing list membership gene by gene inside the loop.
    labelMask = vv1 & plx>1 & abs(lfc)>1 & ~contains(T1.names,'.') & ismember(T1.names,genesNames);
    labelIdx = find(labelMask);
    for ig=labelIdx(:)'
        text(plx(ig)*1.05,lfc(ig),T1.names{ig},'FontSize',16, 'FontAngle','italic');
    end

    % Space-separated list of the labelled genes (with a leading space, or
    % empty when nothing was labelled).
    strongGenes='';
    if ~isempty(labelIdx)
        strongGenes = [' ' strjoin(reshape(T1.names(labelIdx),1,[]), ' ')];
    end
    disp(strongGenes);
    
    box on; grid on;
    xlabel('Gene Expression, CPM')
    ylabel('log_2(Pan/Ref)') 
    title(sheetNames{ii})
    exportgraphics(gcf,[figDir '/lfcVsCpm_' num2str(ii) '.png'], 'Resolution',600)
    exportgraphics(gcf,[figDir '/lfcVsCpm_' num2str(ii) '.pdf'])
end

%% combine genes from all clusters
% Stack the numeric columns (everything after the first column) of all
% cluster tables into A (genes x columns x clusters). For every gene, keep the
% row from the cluster whose value in column 4 of A (table column 5) has the
% largest magnitude, and store the result in Tb.
% NOTE(review): the original header read "select min p-value", but the
% selection is max(abs(.)) of that column - confirm which table column it is
% and whether max or min is intended.
% Rows are combined by position, i.e. all tables are assumed to list the same
% genes in the same order - TODO confirm.
nClusters = numel(T);
A1 = table2array(T{1}(:,2:end));
A = repmat(A1, [1 1 nClusters]);        % preallocate; page 1 is already T{1}
for ii=2:nClusters
    A(:,:,ii)=table2array(T{ii}(:,2:end));
end

[nGenes, nCols, ~] = size(A);

% Index of the selected cluster for every gene (first one wins on ties).
[~,bestCluster] = max(abs(A(:,4,:)),[],3);

% Linear indices into A of the element (gene, column, bestCluster(gene)).
linIdx = (1:nGenes)' + ((1:nCols)-1)*nGenes + (bestCluster-1)*(nGenes*nCols);

Tb=T{1};
Tb{:,2:end} = reshape(A(linIdx), nGenes, nCols);

%% Plot for the combined table Tb (figure 210, exported as PDF only)
ii = 10;                                    % suffix for figure number and file name
T1 = Tb;
vv = max(T1.cpm_ref, T1.cpm_pan) > 0.5;     % & abs(T1.cpm_ref-T1.cpm_pan)./(T1.cpm_ref+T1.cpm_pan)>0.01;
T1 = T1(vv, :);

% Log fold changes clipped to [-4, 4] for display.
lfc = T1.logfoldchanges;
lfc(lfc < -4) = -4;
lfc(lfc > 4)  = 4;

pane = T1.cpm_pan;
refe = T1.cpm_ref;
cpmMax = max(pane, refe);                   % x coordinate of every gene

% Close any existing figure with this number, then recreate it.
figure(200 + ii);
close(200 + ii);
fig = figure(200 + ii);

% All genes in blue ...
plot(cpmMax, lfc, 'o', 'MarkerSize', 10, 'MarkerEdgeColor', 'Blue', 'LineWidth', 2);

% ... with the genes at padj <= 0.1 drawn on top in red.
vv1 = T1.padj <= 0.1;
line(cpmMax(vv1), lfc(vv1), 'Marker', 'o', 'LineStyle', 'none', ...
    'MarkerSize', 10, 'MarkerEdgeColor', 'Red', 'LineWidth', 2);
hold off

ax = gca;
ax.XScale = 'log';
xlim([0.5 200]);
grid on; box on;
xlabel('Gene Expression, CPM')
ylabel('log_2(Pan/Ref)')
% title(sheetNames{ii})
exportgraphics(fig, sprintf('matlab_figures/lfcVsCpm_%d.pdf', ii))

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DGE between clusters
%% Load every worksheet of the between-cluster workbook
pairFile = 'write_pbmc5k/CRclusters9/DGEbetweenClusters.xlsx';
clPairsG = sheetnames(pairFile);            % worksheet names, one per table
clNames  = {'T','Monocytes', 'B', 'NK'};

% Td{k} holds sheet k sorted by its first column, so that tables sorted the
% same way can be compared row by row.
clear Td
for ii = 1:numel(clPairsG)
    Td{ii} = sortrows(readtable(pairFile, 'Sheet', ii), 1);
end

%% Scatter of log fold changes from two consecutive between-cluster tables
% Table ii goes on the x-axis (labelled "Ref") and table ii+1 on the y-axis
% (labelled "Pan"); points are coloured by -log10 of the smaller adjusted
% p-value of the two tables.
% NOTE(review): assumes both tables list the same genes in the same row
% order - confirm.
for ii=1%:2:length(clPairsG)
    T1 = Td{ii};
    T2 = Td{ii+1};

    % Genes with adjusted p < 0.01 in at least one of the two tables.
    vv = (T1.pvals_adj < 0.01) | (T2.pvals_adj < 0.01);

    % Colour value; the 1e-10 offset keeps the logarithm finite for p = 0.
    minP = min(T1.pvals_adj, T2.pvals_adj);
    minP = -log10(minP(vv) + 1e-10);

    % Log fold changes clipped to [-10, 10].
    lfc1 = T1.logfoldchanges(vv);
    lfc2 = T2.logfoldchanges(vv);
    big1 = abs(lfc1) > 10;
    big2 = abs(lfc2) > 10;
    lfc1(big1) = 10 * sign(lfc1(big1));
    lfc2(big2) = 10 * sign(lfc2(big2));

    % Close any existing figure with this number, then recreate it.
    figure(300 + ii);
    close(300 + ii);
    fig = figure(300 + ii);
    set(fig, 'Position', [50 50 2000 1400])
    %plot(lfc1, lfc2, 'o', 'MarkerSize', 10, 'MarkerEdgeColor','Blue', 'LineWidth',2);

    scatter(lfc1, lfc2, 50, minP, 'LineWidth', 2);
    colormap(jet);
    colorbar;
    axis('square');
    text(12.5, 10, 'p_{adj}', 'FontSize', 40);

    % The sheet name is split at '_' into the two cluster names.
    pairParts = strsplit(clPairsG{ii}, '_');
    cl1 = ['log_2(FoldChange): ' pairParts{1} ' vs ' pairParts{2}];

    xlabel([cl1 ', Ref'])
    ylabel([cl1 ', Pan'])
    grid on; box on;

    outBase = ['matlab_figures/lfcPanVsRef_' clPairsG{ii}];
    exportgraphics(fig, [outBase '.png'], 'Resolution', 600)
    exportgraphics(fig, [outBase '.pdf'])

end
%% old
% for ii=1
%     T1 = T{ii};
%     vv = T1.pvals<1;
%     T1 = T1(vv,:);
% 
%     lfc = T1.logfoldchanges;
%     lfc(lfc>4) = 4;
%     lfc(lfc<-4) = -4;
% 
%     logp = -log10(T1.pvals);
%     logp(logp>5) = 5;
% 
%     figure(100+ii);    
%     scatter(lfc, logp, 'o');
% 
%     pane = T1.cpm_pan;
%     refe = T1.cpm_ref;
% 
%     figure(200+ii);close(200+ii);figure(200+ii);
%     scatter(max(pane,refe), lfc, 'o');
%     hold on
%     vv1 = T1.pvals_adj<=0.1;
%     scatter(max(pane(vv1),refe(vv1)), lfc(vv1), 'o', 'MarkerEdgeColor','Red');
% 
%     %set(gca,'Xscale','log');
%     xlim([0 10]);
% 
% end

%% add biomart descriptions
% BM: BioMart export; the columns GeneStableID, GeneType, GeneDescription and
%     PhenotypeDescription are used below.
% GN: features table read without a header; Var1 is used as the gene ID and
%     Var2 as the gene name.
BM=readtable('mart_export.filt.txt');
% BM.Properties.RowNames=BM.GeneName;
GN=readtable('../pan/Solo.out/Gene/raw/features.tsv','FileType','text','ReadVariableNames',false);


%% Map the significant genes of the first cluster to BioMart rows
T1=T{1}(T{1}.padj<0.1, :);

% Gene name -> gene ID. Names missing from GN get an empty ID instead of
% being used as index 0, which would abort with an indexing error.
[inGN,locGN]=ismember(T1.names, GN.Var2);
gID=repmat({''},height(T1),1);
gID(inGN)=GN.Var1(locGN(inGN));
gID=regexprep(gID,'\..*','');          % drop everything from the first '.' on

% Gene ID -> row of BM (0 when absent). ismember returns the lowest matching
% row when an ID occurs more than once in BM.
[~,ind1]=ismember(gID, BM.GeneStableID);
ind1(~inGN)=0;                         % unmatched names never get an annotation
if any(~inGN)
    warning('DGE:unmatchedGenes', ...
        '%d significant gene name(s) not found in features.tsv; left without annotation.', nnz(~inGN));
end

%% Append the annotation columns; genes without a BioMart row keep ''
T2=T1;
for ff1={'GeneType','GeneDescription','PhenotypeDescription'}
    ff=ff1{1};
    T2.(ff)=repmat({''},height(T1),1);
    T2.(ff)(ind1>0)=BM.(ff)(ind1(ind1>0));
end

%% Export the annotated table
writetable(T2,'SignificantGenes.csv');