%% 2 December 2021. Coey & Clark (2021) Table S1: Gcn4 Half-Site analysis.
% Half-site = ATGAC/GTCAT. Total: 25,838 (including 291 full sites).
% Identify half-sites in peaks containing full sites
% Get sequences of bound and unbound motifs for MEME

load('sacCer3_genome.mat', 'genome');
noChr = numel(genome);
chrLen = [genome.chrLen];

%% Record HC Motif chromosome and nt locations. HC motif = ATGAC in top strand
% Hits are gathered chromosome by chromosome and concatenated once at the end.
% The stored coordinate is the last base (C) of ATGAC, i.e. match start + 4.
hcChrPerChr = cell(noChr, 1);
hcLocPerChr = cell(noChr, 1);
for chr = 1 : noChr
    hcStarts = strfind(upper(genome(chr).Seq), 'ATGAC');
    hcChrPerChr{chr} = repmat(chr, numel(hcStarts), 1);
    hcLocPerChr{chr} = hcStarts(:) + 4;
end
Chr_Gcn4_AP1_HC = vertcat(hcChrPerChr{:});
Loc_Gcn4_AP1_HC = vertcat(hcLocPerChr{:});
TotalNoSites_AP1_HC = numel(Loc_Gcn4_AP1_HC);

%% Record HG Motif chromosome and nt locations. HG motif = GTCAT in top strand
% Hits are gathered chromosome by chromosome and concatenated once at the end.
% The stored coordinate is the first base (G) of GTCAT, i.e. the match start.
hgChrPerChr = cell(noChr, 1);
hgLocPerChr = cell(noChr, 1);
for chr = 1 : noChr
    hgStarts = strfind(upper(genome(chr).Seq), 'GTCAT');
    hgChrPerChr{chr} = repmat(chr, numel(hgStarts), 1);
    hgLocPerChr{chr} = hgStarts(:);
end
Chr_Gcn4_AP1_HG = vertcat(hgChrPerChr{:});
Loc_Gcn4_AP1_HG = vertcat(hgLocPerChr{:});
TotalNoSites_AP1_HG = numel(Loc_Gcn4_AP1_HG);

%% Get half-site motif sequences (25838 half sites (HS) with the sacCer3 genome)
% Pool the HC and HG hits; every row describes one half-site.
HS_MotifChrNo = [Chr_Gcn4_AP1_HC; Chr_Gcn4_AP1_HG];
HS_MotifCoord = [Loc_Gcn4_AP1_HC; Loc_Gcn4_AP1_HG];
% Sort the motifs by chromosome, then coordinate, using a matrix, then back to columns
M1 = [HS_MotifChrNo, HS_MotifCoord];
M2 = sortrows(M1);
HS_MotifChrNo = M2(:,1);
HS_MotifCoord = M2(:,2);
% Get the 11-bp window (coordinate -5 ... +5) around each half-site.
% The number of half-sites is taken from the data rather than hard-coded, and the
% window is clipped to the chromosome so a motif within 5 bp of a chromosome end
% yields a shorter sequence instead of an out-of-range indexing error.
noHS = numel(HS_MotifCoord);
HS_MotifSeq = strings(noHS,1);
for b = 1:noHS
    chrSeq = genome(HS_MotifChrNo(b)).Seq;
    winStart = max(HS_MotifCoord(b)-5, 1);
    winEnd = min(HS_MotifCoord(b)+5, numel(chrSeq));
    HS_MotifSeq(b) = chrSeq(winStart:winEnd);
end

%% Identify and count half-sites which are really full sites (291 with sacCer3).
% A half-site is flagged (HS_AP1 = 1) when its 11-bp window contains either
% orientation of the full site. The match ignores letter case because the
% half-sites were located on upper(Seq) while the windows were cut from the
% raw sequence. The array is sized by HS_MotifSeq rather than a fixed count.
fullSitePatterns = ["TGACTCA", "TGAGTCA"];
HS_AP1 = double(contains(HS_MotifSeq, fullSitePatterns, 'IgnoreCase', true));
Full_AP1_Count = sum(HS_AP1);

%% Calculate fraction of half-sites bound by Gcn4: G-SELEX
% Test if each site is located within a peak (strictly between start and end).
% Count peaks with at least 1 half-site: 1962 of 2359; 3591 half-sites in peaks
load('Gcn4_SELEX_all_replicates_shared_peaks_bed.mat','chrNo','peak_start','peak_end');
% Sizes come from the loaded data instead of hard-coded counts.
noHS = numel(HS_MotifCoord);
noSelPeaks = numel(peak_start);
HS_Selex = zeros(noHS,1);         % 1 if the half-site lies inside any G-SELEX peak
Selex_HS_peaks = zeros(noSelPeaks,1);   % 1 if the peak contains at least one half-site
for d = 1:noSelPeaks
    % One vectorised test per peak replaces the half-site x peak double loop.
    inPeak = HS_MotifChrNo == chrNo(d) & ...
        HS_MotifCoord > peak_start(d) & HS_MotifCoord < peak_end(d);
    if any(inPeak)
        HS_Selex(inPeak) = 1;
        Selex_HS_peaks(d) = 1;
    end
end
Bound_Selex_Count = sum(HS_Selex);
Selex_Peaks_with_HS = sum(Selex_HS_peaks);

%% Calculate fraction of half-sites bound by Gcn4: ChIP (induced)
% Test if each site is located within a peak (strictly between start and end).
% Count peaks with at least 1 half-site: 422 of 546 (821 half-sites in ChIP peaks)
load('Gcn4_ChIP_WT_induced_peaks.mat','ChIPchrNo','ChIPpeak_start','ChIPpeak_end', ...
    'Peak_No');
% Sizes come from the loaded data instead of hard-coded counts.
noHS = numel(HS_MotifCoord);
noChIPpeaks = numel(ChIPpeak_start);
HS_ChIP = zeros(noHS,1);             % 1 if the half-site lies inside any ChIP peak
ChIP_HS_peaks = zeros(noChIPpeaks,1);   % 1 if the peak contains at least one half-site
for d = 1:noChIPpeaks
    % One vectorised test per peak replaces the half-site x peak double loop.
    inPeak = HS_MotifChrNo == ChIPchrNo(d) & ...
        HS_MotifCoord > ChIPpeak_start(d) & HS_MotifCoord < ChIPpeak_end(d);
    if any(inPeak)
        HS_ChIP(inPeak) = 1;
        ChIP_HS_peaks(d) = 1;
    end
end
Bound_ChIP_Count = sum(HS_ChIP);
ChIP_Peaks_with_HS = sum(ChIP_HS_peaks);

%% Count number of half-sites within each G-SELEX peak
% Find sequence of peak, count half-sites.
% Total Half-sites in peaks = 3584 (1826 C and 1758 G)
% After subtraction of 240 peaks with full sites = 3344 half-sites
% The peak sequences are extracted once and reused for both the half-site and
% the full-site counts; the number of peaks is taken from the loaded data.
noSelPeaks = numel(peak_start);
Peak_Seq = strings(noSelPeaks,1);
for d = 1:noSelPeaks
    Peak_Seq(d) = genome(chrNo(d)).Seq(peak_start(d):peak_end(d));
end
% Counting ignores letter case, matching the genome-wide search done on upper(Seq).
HS_C_count = count(Peak_Seq, 'ATGAC', 'IgnoreCase', true);
HS_G_count = count(Peak_Seq, 'GTCAT', 'IgnoreCase', true);
HC_count = sum(HS_C_count);
HG_count = sum(HS_G_count);
Total_HS_count = HS_C_count + HS_G_count;
Total_HS = sum(Total_HS_count);

% Subtract full sites: half-sites that are part of a full site in either orientation
FS_C_count = count(Peak_Seq, 'ATGACTCA', 'IgnoreCase', true);
FS_G_count = count(Peak_Seq, 'TGAGTCAT', 'IgnoreCase', true);
FC_count = sum(FS_C_count);
FG_count = sum(FS_G_count);
Total_FS_count = FS_C_count + FS_G_count;
Total_FS = sum(Total_FS_count);

Revised_HS_count = Total_HS_count - Total_FS_count;
Revised_Total_HS = sum(Revised_HS_count);

%% Count number of half-sites within each ChIP peak
% Find sequence of peak, count half-sites.
% Previously recorded: Total Half-sites in peaks = 848 (433 C and 415 G);
% after subtraction of 63 peaks with full sites = 785 half-sites.
% NOTE(review): those figures were recorded when the counts below were taken
% from Peak_Seq (the G-SELEX peak sequences) instead of ChIP_Peak_Seq, so they
% should be re-checked after re-running this section.
% The peak sequences are extracted once and reused for both the half-site and
% the full-site counts; the number of peaks is taken from the loaded data.
noChIPpeaks = numel(ChIPpeak_start);
ChIP_Peak_Seq = strings(noChIPpeaks,1);
for d = 1:noChIPpeaks
    ChIP_Peak_Seq(d) = genome(ChIPchrNo(d)).Seq(ChIPpeak_start(d):ChIPpeak_end(d));
end
% Count in the ChIP peak sequences (not the G-SELEX ones). Counting ignores
% letter case, matching the genome-wide search done on upper(Seq).
ChIP_HS_C_count = count(ChIP_Peak_Seq, 'ATGAC', 'IgnoreCase', true);
ChIP_HS_G_count = count(ChIP_Peak_Seq, 'GTCAT', 'IgnoreCase', true);
ChIP_HC_count = sum(ChIP_HS_C_count);
ChIP_HG_count = sum(ChIP_HS_G_count);
Total_ChIP_HS_count = ChIP_HS_C_count + ChIP_HS_G_count;
Total_ChIP_HS = sum(Total_ChIP_HS_count);

% Subtract full sites: half-sites that are part of a full site in either orientation
ChIP_FS_C_count = count(ChIP_Peak_Seq, 'ATGACTCA', 'IgnoreCase', true);
ChIP_FS_G_count = count(ChIP_Peak_Seq, 'TGAGTCAT', 'IgnoreCase', true);
ChIP_FC_count = sum(ChIP_FS_C_count);
ChIP_FG_count = sum(ChIP_FS_G_count);
Total_ChIP_FS_count = ChIP_FS_C_count + ChIP_FS_G_count;
Total_ChIP_FS = sum(Total_ChIP_FS_count);

Revised_ChIP_HS_count = Total_ChIP_HS_count - Total_ChIP_FS_count;
Revised_Total_ChIP_HS = sum(Revised_ChIP_HS_count);

%% G-SELEX: Site/Peak occupancies to compare bound and unbound half-sites
% NormOcc is indexed as NormOcc{chromosome}(coordinate).
load('Norm_Occupancy_Gcn4_GSelex_R3.combined_0_5000.mat','NormOcc');
% Normalised occupancy at each half-site coordinate (array sized from the data,
% not from a hard-coded half-site count).
noHS = numel(HS_MotifCoord);
Selex_Occ = zeros(noHS,1);
for b = 1:noHS
    Selex_Occ(b) = NormOcc{HS_MotifChrNo(b)}(HS_MotifCoord(b));
end
% Get normalised occupancy for all G-SELEX peaks (maximum value within the peak)
noSelPeaks = numel(peak_start);
SelPeakOcc = zeros(noSelPeaks,1);
for d = 1:noSelPeaks
    SelPeakOcc(d) = max(NormOcc{chrNo(d)}(peak_start(d):peak_end(d)));
end
% Release the occupancy data; the ChIP section loads a variable with the same name.
clear NormOcc

%% ChIP: Site/Peak occupancies to compare bound and unbound half-sites
% NormOcc is indexed as NormOcc{chromosome}(coordinate).
load('Norm_Occupancy_SC_Gcn4_WT_I.combined.rmdup_0_500.mat','NormOcc');
% Normalised occupancy at each half-site coordinate (array sized from the data,
% not from a hard-coded half-site count).
noHS = numel(HS_MotifCoord);
ChIP_Occ = zeros(noHS,1);
for b = 1:noHS
    ChIP_Occ(b) = NormOcc{HS_MotifChrNo(b)}(HS_MotifCoord(b));
end
% Get normalised occupancy for all ChIP peaks (maximum value within the peak)
noChIPpeaks = numel(ChIPpeak_start);
ChIPpeakOcc = zeros(noChIPpeaks,1);
for d = 1:noChIPpeaks
    ChIPpeakOcc(d) = max(NormOcc{ChIPchrNo(d)}(ChIPpeak_start(d):ChIPpeak_end(d)));
end
clear NormOcc

%% Identify half-sites occurring within G-SELEX peaks attributed to full-sites
% Identify G-SELEX peaks with full sites (either orientation of the full site).
% Sizes are taken from the loaded data rather than hard-coded counts.
noSelPeaks = numel(peak_start);
noHS = numel(HS_MotifCoord);
Sel_Full_Peak_Seq = strings(noSelPeaks,1);
for d = 1:noSelPeaks
    Sel_Full_Peak_Seq(d) = genome(chrNo(d)).Seq(peak_start(d):peak_end(d));
end
% Case is ignored, matching the genome-wide half-site search done on upper(Seq).
Sel_Full_Site = double(contains(Sel_Full_Peak_Seq, ["TGACTCA", "TGAGTCA"], ...
    'IgnoreCase', true));
% Start/end stay 0 for peaks without a full site, so no coordinate can fall
% strictly between them.
selHasFull = Sel_Full_Site == 1;
Sel_Full_start = zeros(noSelPeaks,1);
Sel_Full_end = zeros(noSelPeaks,1);
Sel_Full_start(selHasFull) = peak_start(selHasFull);
Sel_Full_end(selHasFull) = peak_end(selHasFull);
% Identify Half-Sites occurring within Selex peaks with full sites = 667
In_FS_SelPeak = zeros(noHS,1);
for d = find(selHasFull)'
    % One vectorised test per full-site peak replaces the double loop.
    inPeak = HS_MotifChrNo == chrNo(d) & ...
        HS_MotifCoord > Sel_Full_start(d) & HS_MotifCoord < Sel_Full_end(d);
    In_FS_SelPeak(inPeak) = 1;
end
HS_in_FS_SelPeak_count = sum(In_FS_SelPeak);

%% Identify half-sites occurring within ChIP peaks attributed to full-sites
% Identify ChIP peaks with full sites (either orientation of the full site).
% Sizes are taken from the loaded data rather than hard-coded counts.
noChIPpeaks = numel(ChIPpeak_start);
noHS = numel(HS_MotifCoord);
ChIP_Full_Peak_Seq = strings(noChIPpeaks,1);
for d = 1:noChIPpeaks
    ChIP_Full_Peak_Seq(d) = genome(ChIPchrNo(d)).Seq(ChIPpeak_start(d):ChIPpeak_end(d));
end
% Case is ignored, matching the genome-wide half-site search done on upper(Seq).
ChIP_Full_Site = double(contains(ChIP_Full_Peak_Seq, ["TGACTCA", "TGAGTCA"], ...
    'IgnoreCase', true));
% Start/end stay 0 for peaks without a full site, so no coordinate can fall
% strictly between them.
chipHasFull = ChIP_Full_Site == 1;
ChIP_Full_start = zeros(noChIPpeaks,1);
ChIP_Full_end = zeros(noChIPpeaks,1);
ChIP_Full_start(chipHasFull) = ChIPpeak_start(chipHasFull);
ChIP_Full_end(chipHasFull) = ChIPpeak_end(chipHasFull);
% Identify Half-Sites occurring within ChIP peaks with full sites = 534
In_FS_ChIPpeak = zeros(noHS,1);
for d = find(chipHasFull)'
    % One vectorised test per full-site peak replaces the double loop.
    inPeak = HS_MotifChrNo == ChIPchrNo(d) & ...
        HS_MotifCoord > ChIP_Full_start(d) & HS_MotifCoord < ChIP_Full_end(d);
    In_FS_ChIPpeak(inPeak) = 1;
end
HS_in_FS_ChIPpeak_count = sum(In_FS_ChIPpeak);

%% Get 51 bp sequences for MEME analysis - G-SELEX - 2924 bound and 22196 unbound motifs
% Only half-sites that are not full sites and not inside a full-site peak are used.
% The output arrays are preallocated to the exact number of selected half-sites.
% Starting from strings() and appending left an empty first element that was
% written to the MEME input files.
% The window (coordinate -25 ... +25) is clipped to the chromosome, so a motif
% within 25 bp of a chromosome end gives a shorter sequence instead of an error.
selBoundIdx = find(HS_AP1 == 0 & In_FS_SelPeak == 0 & HS_Selex == 1);
Sel_Bound_MotifSeq = strings(numel(selBoundIdx),1);
for k = 1:numel(selBoundIdx)
    b = selBoundIdx(k);
    chrSeq = genome(HS_MotifChrNo(b)).Seq;
    winStart = max(HS_MotifCoord(b)-25, 1);
    winEnd = min(HS_MotifCoord(b)+25, numel(chrSeq));
    Sel_Bound_MotifSeq(k) = chrSeq(winStart:winEnd);
end
selUnboundIdx = find(HS_AP1 == 0 & In_FS_SelPeak == 0 & HS_Selex == 0);
Sel_Unbound_MotifSeq = strings(numel(selUnboundIdx),1);
for k = 1:numel(selUnboundIdx)
    b = selUnboundIdx(k);
    chrSeq = genome(HS_MotifChrNo(b)).Seq;
    winStart = max(HS_MotifCoord(b)-25, 1);
    winEnd = min(HS_MotifCoord(b)+25, numel(chrSeq));
    Sel_Unbound_MotifSeq(k) = chrSeq(winStart:winEnd);
end
%% Get 51 bp sequences for MEME analysis - ChIP - 287 bound and 24914 unbound motifs
% Only half-sites that are not full sites and not inside a full-site peak are used.
% The output arrays are preallocated to the exact number of selected half-sites.
% Starting from strings() and appending left an empty first element that was
% written to the MEME input files.
% The window (coordinate -25 ... +25) is clipped to the chromosome, so a motif
% within 25 bp of a chromosome end gives a shorter sequence instead of an error.
chipBoundIdx = find(HS_AP1 == 0 & In_FS_ChIPpeak == 0 & HS_ChIP == 1);
ChIP_Bound_MotifSeq = strings(numel(chipBoundIdx),1);
for k = 1:numel(chipBoundIdx)
    b = chipBoundIdx(k);
    chrSeq = genome(HS_MotifChrNo(b)).Seq;
    winStart = max(HS_MotifCoord(b)-25, 1);
    winEnd = min(HS_MotifCoord(b)+25, numel(chrSeq));
    ChIP_Bound_MotifSeq(k) = chrSeq(winStart:winEnd);
end
chipUnboundIdx = find(HS_AP1 == 0 & In_FS_ChIPpeak == 0 & HS_ChIP == 0);
ChIP_Unbound_MotifSeq = strings(numel(chipUnboundIdx),1);
for k = 1:numel(chipUnboundIdx)
    b = chipUnboundIdx(k);
    chrSeq = genome(HS_MotifChrNo(b)).Seq;
    winStart = max(HS_MotifCoord(b)-25, 1);
    winEnd = min(HS_MotifCoord(b)+25, numel(chrSeq));
    ChIP_Unbound_MotifSeq(k) = chrSeq(winStart:winEnd);
end

%% Save data. Tables
% Workspace variables written to the summary MAT-file.
summaryVars = {'HS_MotifChrNo','HS_MotifCoord','HS_MotifSeq','HS_AP1', ...
    'HS_Selex','In_FS_SelPeak','Selex_Occ','HS_ChIP','ChIP_Occ','In_FS_ChIPpeak', ...
    'chrNo','peak_start','peak_end','Selex_HS_peaks','Revised_HS_count','Sel_Full_Site', ...
    'SelPeakOcc', ...
    'Peak_No','ChIPchrNo','ChIPpeak_start','ChIPpeak_end','ChIP_HS_peaks', ...
    'Revised_ChIP_HS_count','ChIP_Full_Site','ChIPpeakOcc'};
save('Half_Site_Summary.mat', summaryVars{:});

% Table S1: one row per half-site.
halfSiteHeaders = {'Chr','Coordinate','Motif','Full site','G-Selex peak', ...
    'In Full Site SELEX peak','G-Selex Occ.','ChIP peak','In Full Site ChIP peak', ...
    'ChIP Occ.'};
T1 = table(HS_MotifChrNo, HS_MotifCoord, HS_MotifSeq, HS_AP1, HS_Selex, In_FS_SelPeak, ...
    Selex_Occ, HS_ChIP, In_FS_ChIPpeak, ChIP_Occ, ...
    'VariableNames', halfSiteHeaders);
writetable(T1, 'Table_S1_Half_Site_List.csv');

% One row per G-SELEX peak.
selexPeakHeaders = {'Chr','Peak start','Peak end','Half-site', ...
    'Selex half-sites/peak','Selex Full Site','G-SELEX Peak Occ.'};
T2 = table(chrNo, peak_start, peak_end, Selex_HS_peaks, Revised_HS_count, ...
    Sel_Full_Site, SelPeakOcc, ...
    'VariableNames', selexPeakHeaders);
writetable(T2, 'Half_Site_SELEX_List.csv');

% One row per ChIP peak.
chipPeakHeaders = {'Peak No','Chr','Peak start','Peak end','Half-site', ...
    'ChIP half-sites/peak','ChIP Full Site','ChIP Peak Occ.'};
T3 = table(Peak_No, ChIPchrNo, ChIPpeak_start, ChIPpeak_end, ChIP_HS_peaks, ...
    Revised_ChIP_HS_count, ChIP_Full_Site, ChIPpeakOcc, ...
    'VariableNames', chipPeakHeaders);
writetable(T3, 'Half_Site_ChIP_List.csv');

%% Save 51-bp sequences for MEME analysis (separate files)
% Each sequence list becomes a single-column table written to its own CSV file.
selUnboundHeader = {'G-SELEX unbound'};
T4 = table(Sel_Unbound_MotifSeq, 'VariableNames', selUnboundHeader);
writetable(T4, 'Unbound_HalfSite_G-SELEX_sequences.csv');

selBoundHeader = {'G-SELEX bound'};
T5 = table(Sel_Bound_MotifSeq, 'VariableNames', selBoundHeader);
writetable(T5, 'Bound_HalfSite_G-SELEX_sequences.csv');

chipUnboundHeader = {'ChIP unbound'};
T6 = table(ChIP_Unbound_MotifSeq, 'VariableNames', chipUnboundHeader);
writetable(T6, 'Unbound_HalfSite_ChIP_sequences.csv');

chipBoundHeader = {'ChIP bound'};
T7 = table(ChIP_Bound_MotifSeq, 'VariableNames', chipBoundHeader);
writetable(T7, 'Bound_HalfSite_ChIP_sequences.csv');
