%% 2 December 2021. Coey & Clark Table S4. Gcn4 AP1 site occupancies: ChIP and G-SELEX.
% Based on 'Get_Gcn4_AP1_motif_norm_occ_all_samples_prom600_v4.m'
% Produces 'Table_S4_Gcn4_AP1_site_occupancies.csv'

load('sacCer3_genome.mat', 'genome');
chrLen = [genome.chrLen];   % one chrLen entry per element of the genome struct
noChr = numel(genome);      % number of sequences held in the genome struct

%% C Motif (TGACTCA in top strand): Find motif sequence for each site.
% Collects, for every top-strand TGACTCA, the 9-nt window made of the motif
% plus one flanking base on each side. Rows of AllCMotifs follow chromosome
% order, then position along the chromosome.
% The scan covers all noChr sequences (not a fixed 16) so that Total_CSites
% agrees with the location scan below, which also iterates 1:noChr.
AllCMotifs = [];
Total_CSites = 0;
for chr = 1 : noChr
    chrSeq = genome(chr).Seq;
    % Search is case-insensitive; the stored window keeps the original case.
    indC = strfind(upper(chrSeq), 'TGACTCA');
    noCSitesChr = numel(indC);
    Total_CSites = Total_CSites + noCSitesChr;
    if noCSitesChr > 0
        % One row of indices per site: (motif start - 1) ... (motif start + 7).
        % Extracting all windows at once avoids growing the matrix per site.
        windowIdx = indC(:) + (-1:7);
        AllCMotifs = [AllCMotifs; chrSeq(windowIdx)];
    end
end

%% Record C Motif chromosome and nt locations. Adjusted to record central base 'C'.
% For every sequence in the genome, locate each top-strand TGACTCA and store
% the chromosome number alongside the coordinate of the central 'C'
% (motif start + 3).
Chr_Gcn4_AP1_C = [];
Loc_Gcn4_AP1_C = [];
TotalNoSites_AP1_C = 0;
for chr = 1 : noChr
    motifStartsC = strfind(upper(genome(chr).Seq), 'TGACTCA');
    nHitsC = numel(motifStartsC);
    centreC = motifStartsC' + 3;                       % column of central-base coordinates
    Chr_Gcn4_AP1_C = vertcat(Chr_Gcn4_AP1_C, repmat(chr, nHitsC, 1));
    Loc_Gcn4_AP1_C = vertcat(Loc_Gcn4_AP1_C, centreC);
    TotalNoSites_AP1_C = TotalNoSites_AP1_C + nHitsC;
end

%% G Motif (TGAGTCA in top strand): Find motif sequence for each site.
% Collects, for every top-strand TGAGTCA, the 9-nt window made of the motif
% plus one flanking base on each side. Rows of AllGMotifs follow chromosome
% order, then position along the chromosome.
% The scan covers all noChr sequences (not a fixed 16) so that Total_GSites
% agrees with the location scan below, which also iterates 1:noChr.
AllGMotifs = [];
Total_GSites = 0;
for chr = 1 : noChr
    chrSeq = genome(chr).Seq;
    % Search is case-insensitive; the stored window keeps the original case.
    indG = strfind(upper(chrSeq), 'TGAGTCA');
    noGSitesChr = numel(indG);
    Total_GSites = Total_GSites + noGSitesChr;
    if noGSitesChr > 0
        % One row of indices per site: (motif start - 1) ... (motif start + 7).
        % Extracting all windows at once avoids growing the matrix per site.
        windowIdx = indG(:) + (-1:7);
        AllGMotifs = [AllGMotifs; chrSeq(windowIdx)];
    end
end

%% G Motif: Record chromosome and nt locations. Adjusted to record central base 'G'.
% For every sequence in the genome, locate each top-strand TGAGTCA and store
% the chromosome number alongside the coordinate of the central 'G'
% (motif start + 3).
Chr_Gcn4_AP1_G = [];
Loc_Gcn4_AP1_G = [];
TotalNoSites_AP1_G = 0;
for chr = 1 : noChr
    motifStartsG = strfind(upper(genome(chr).Seq), 'TGAGTCA');
    nHitsG = numel(motifStartsG);
    centreG = motifStartsG' + 3;                       % column of central-base coordinates
    Chr_Gcn4_AP1_G = vertcat(Chr_Gcn4_AP1_G, repmat(chr, nHitsG, 1));
    Loc_Gcn4_AP1_G = vertcat(Loc_Gcn4_AP1_G, centreG);
    TotalNoSites_AP1_G = TotalNoSites_AP1_G + nHitsG;
end

%% Get a single sorted motif list: Combine C and G motifs and sort by chromosome/coord
% Stack the C-motif and G-motif site lists, then order the rows by
% chromosome number and, within a chromosome, by coordinate.
MotifChrNo = vertcat(Chr_Gcn4_AP1_C, Chr_Gcn4_AP1_G);
MotifCoord = vertcat(Loc_Gcn4_AP1_C, Loc_Gcn4_AP1_G);
% Two-column matrix: [chromosome, coordinate]
M = horzcat(MotifChrNo, MotifCoord);
M1 = sortrows(M, [1 2]);
% Split the sorted matrix back into its two columns
Motif_Chr_No = M1(:, 1);
Motif_Coord = M1(:, 2);

%% Get motif sequences using sorted columns
% Build the 9-nt sequence (central base +/- 4 nt) for every site in the
% sorted list.
% MotifSeq is created fresh at the right size, so a stale copy left in the
% workspace by an earlier run cannot leak extra rows into the table.
% The sequence is upper-cased because the sites were located on the
% upper-cased genome and the motif-type patterns are upper case; without
% this, a site containing lower-case bases would be left without a type.
MotifSeq = strings(numel(Motif_Chr_No), 1);
for a = 1:numel(Motif_Chr_No)
    centre = Motif_Coord(a,1);
    Motif_Seq = upper(genome(Motif_Chr_No(a,1)).Seq(centre-4:centre+4));
    MotifSeq(a,1) = string(Motif_Seq);
end

%% Specify motif types: Bases are for C-strand
% Each 9-nt site is labelled by its two flanking bases, read on the strand
% that carries the central C (TGACTCA). Sites found as TGAGTCA are therefore
% labelled with the reverse complement of their flanks; e.g. ATGAGTCAC reads
% GTGACTCAT on the other strand and gets type 'GT'.
% Motif_RY_Type classes the flank pair by purine (R = A/G) / pyrimidine
% (Y = C/T): 'RY', 'YR', or 'RR-YY' when both flanks are of the same class.
% Sites that match none of the 32 expected sequences keep empty labels.
% Output sizes follow the motif list instead of a hard-coded site count.
flankBases = 'ACGT';
flankIsPurine = [true false true false];       % same order as flankBases
ryLabel = ["RR-YY", "RY"; "YR", "RR-YY"];      % row: 5' flank R/Y; column: 3' flank R/Y

% Build the lookup table: 16 flank pairs x 2 core orientations.
knownSites = strings(32, 1);
knownTypes = strings(32, 1);
knownRYTypes = strings(32, 1);
entry = 0;
for k5 = 1:4
    for k3 = 1:4
        % C-motif: flanks are read directly.
        entry = entry + 1;
        knownSites(entry) = [flankBases(k5) 'TGACTCA' flankBases(k3)];
        knownTypes(entry) = flankBases([k5 k3]);
        knownRYTypes(entry) = ryLabel(2 - flankIsPurine(k5), 2 - flankIsPurine(k3));

        % G-motif: flanks are reverse-complemented. With bases ordered ACGT,
        % the complement of flankBases(k) is flankBases(5 - k).
        entry = entry + 1;
        rc5 = 5 - k3;
        rc3 = 5 - k5;
        knownSites(entry) = [flankBases(k5) 'TGAGTCA' flankBases(k3)];
        knownTypes(entry) = flankBases([rc5 rc3]);
        knownRYTypes(entry) = ryLabel(2 - flankIsPurine(rc5), 2 - flankIsPurine(rc3));
    end
end

% Label every site by table look-up.
Motif_Type = strings(numel(MotifSeq),1);
Motif_RY_Type = strings(numel(MotifSeq),1);
[isKnownSite, tableRow] = ismember(MotifSeq, knownSites);
Motif_Type(isKnownSite) = knownTypes(tableRow(isKnownSite));
Motif_RY_Type(isKnownSite) = knownRYTypes(tableRow(isKnownSite));

%% Identify motifs in ORFs and identify ORF
% A motif is flagged 'ORF' when its central coordinate lies strictly between
% the start and end of an annotated ORF on the same chromosome. For rows with
% Watson == 1 the interval is (ORFStart, ORFEnd); for Watson == 0 it is
% (ORFEnd, ORFStart). If several ORFs contain the motif, the last matching
% annotation row supplies ORF_name.
% Output sizes follow the motif list instead of a hard-coded site count.
load('Yeast_sacCer3_annotations_210609.mat','Chr', 'ORF','ORFStart','ORFEnd','Watson','TSS');
nMotifs = numel(Motif_Chr_No);
nAnnotated = 5770;   % annotation rows scanned. NOTE(review): presumably numel(ORF) - confirm
ORF_Motif = strings(nMotifs,1);
ORF_name = strings(nMotifs,1);
for e = 1:nMotifs
    pos = Motif_Coord(e);
    for h = 1:nAnnotated
        if Chr(h) == Motif_Chr_No(e)
            if Watson(h) == 1
                inORF = pos > ORFStart(h) && pos < ORFEnd(h);
            elseif Watson(h) == 0
                inORF = pos > ORFEnd(h) && pos < ORFStart(h);
            else
                inORF = false;   % rows with any other Watson value are ignored
            end
            if inORF
                ORF_Motif(e) = 'ORF';
                ORF_name(e) = ORF(h);
            end
        end
    end
end

% Count motifs in ORFs = 788
% Counts every row flagged 'ORF', whatever the length of the motif list.
ORF_Motif_Count = nnz(ORF_Motif == "ORF");

%% Identify motifs in promoters and identify gene with TSS. PROMOTERS = TSS -600
% A motif is flagged 'Promoter' when its central coordinate lies strictly
% within the 600 bp window next to a TSS on the same chromosome: the window
% (TSS - 600, TSS) for rows with Watson == 1 and (TSS, TSS + 600) for rows
% with Watson == 0. If several genes qualify, the last matching annotation
% row supplies TSS_name.
% Output sizes follow the motif list instead of a hard-coded site count.
nMotifs = numel(Motif_Chr_No);
nAnnotated = 5770;   % annotation rows scanned. NOTE(review): presumably numel(ORF) - confirm
Promoter_Motif = strings(nMotifs,1);
TSS_name = strings(nMotifs,1);
for e = 1:nMotifs
    pos = Motif_Coord(e);
    for h = 1:nAnnotated
        if Chr(h) == Motif_Chr_No(e)
            if Watson(h) == 1
                inPromoter = pos < TSS(h) && pos > TSS(h) - 600;
            elseif Watson(h) == 0
                inPromoter = pos > TSS(h) && pos < TSS(h) + 600;
            else
                inPromoter = false;   % rows with any other Watson value are ignored
            end
            if inPromoter
                Promoter_Motif(e) = 'Promoter';
                TSS_name(e) = ORF(h);
            end
        end
    end
end

% Count motifs in promoters = 253 (TSS -600 bp).
% All three tallies cover every row, whatever the length of the motif list.
Prom_Motif_Count = nnz(Promoter_Motif == "Promoter");
% Count genes with motif near TSS = 253 (600 bp).
% A row counts when its TSS_name is not the empty string.
TSS_name_Count = nnz(~(TSS_name == ""));
% Check for overlap between ORF and promoter = 91 (600 bp).
Overlap_Count = nnz(ORF_Motif == "ORF" & Promoter_Motif == "Promoter");
% Combine ORF and promoter data. If both are indicated, set = ORF.
% Rows in neither class keep an empty location. For rows flagged as both ORF
% and promoter, the ORF label wins and the promoter gene name is cleared.
% The output size follows the motif list instead of a hard-coded site count.
isInORF = ORF_Motif == "ORF";
isInPromoter = Promoter_Motif == "Promoter";
Motif_Location = strings(numel(Motif_Chr_No),1);
Motif_Location(isInPromoter) = 'Promoter';
Motif_Location(isInORF) = 'ORF';            % assigned last so ORF overrides Promoter
TSS_name(isInORF & isInPromoter) = "";
% REPEAT count genes with motif near TSS = 162 (600 bp).
% A row counts when its TSS_name is not the empty string. All tallies cover
% every row, whatever the length of the motif list.
RevTSS_name_Count = nnz(~(TSS_name == ""));
% Count promoters (TSS - 600) = 162 w/o overlap; count ORFs = 788
ORF_Count = nnz(Motif_Location == "ORF");
Promoter_Count = nnz(Motif_Location == "Promoter");

%% Identify rDNA peaks
% Rows 697 and 698 of the sorted motif list are relabelled by hand; whatever
% location label these two rows held before is overwritten.
% NOTE(review): the row numbers are hard-coded, so they presumably identify
% the two rDNA sites only for this exact genome file and sort order -
% confirm whenever the inputs change.
Motif_Location(697) = 'rDNA-1';
Motif_Location(698) = 'rDNA-2';

%% Identify Ty peaks
% A motif receives the Ty_type of an element when its central coordinate lies
% strictly between that element's Ty_start and Ty_end on the same chromosome.
% Both coordinate orders (start < end and start > end) are accepted. If
% several elements contain the motif, the last matching row supplies the name.
% The output size follows the motif list instead of a hard-coded site count.
load('Ty_Coords.mat', 'Ty_Chr_No','Ty_start','Ty_end','Ty_type');
nMotifs = numel(Motif_Chr_No);
nTy = 432;   % Ty rows scanned. NOTE(review): presumably numel(Ty_Chr_No) - confirm
Ty_name = strings(nMotifs,1);
for e = 1:nMotifs
    pos = Motif_Coord(e);
    for h = 1:nTy
        if Ty_Chr_No(h) == Motif_Chr_No(e)
            insideForward = pos > Ty_start(h) && pos < Ty_end(h);
            insideReverse = pos < Ty_start(h) && pos > Ty_end(h);
            if insideForward || insideReverse
                Ty_name(e) = Ty_type(h);
            end
        end
    end
end
% Motifs in Ty count = 37 (no overlap)
% A row counts when its Ty_name is not the empty string; every row is
% examined, whatever the length of the motif list.
Ty_Count = nnz(~(Ty_name == ""));
% Combine Ty data with ORF/promoter
% Rows whose Ty_name contains 'Ty' take that name as their location,
% replacing any label already there.
isTyMotif = contains(Ty_name, 'Ty');
Motif_Location(isTyMotif) = Ty_name(isTyMotif);

%% Identify motifs in G-SELEX peaks
load('Gcn4_SELEX_all_replicates_shared_peaks_bed.mat', ...
    'chrName', 'chrNo', 'peak_start', 'peak_end');
% Make Selex peak filter
% One 0/1 vector per chromosome, set to 1 at every position covered by a peak.
% NOTE(review): the filter is built for chromosomes 1-16 only; presumably no
% motif or peak lies on any further sequence - confirm.
Selex_Filter = cell(1, 16);
for chr = 1:16
    Selex_Filter{chr} = zeros(1, chrLen(chr));
end

for z = 1:numel(chrNo)
    Selex_Filter{chrNo(z)}(1,peak_start(z):peak_end(z)) = 1;
end
% Determine if central nucleotide in each motif is in a Selex peak. 
% No. of motifs in G-Selex peaks = 616. 
% Sized from the motif list instead of a hard-coded site count.
motifInSelexPeak = arrayfun(@(c, x) Selex_Filter{c}(x) == 1, Motif_Chr_No, Motif_Coord);
Selex_Motif = strings(numel(Motif_Chr_No),1);
Selex_Motif(motifInSelexPeak) = 'G-SELEX';
Selex_Peak_Count = nnz(motifInSelexPeak);

% Check no. of peaks = 2359
% A peak start is registered wherever the filter reads 0 0 1 1; the hits of
% all 16 chromosomes are pooled and counted.
Peak_Begin = [0 0 1 1];
selexStartsPerChr = cell(1, 16);
for chr = 1:16
    selexStartsPerChr{chr} = strfind(Selex_Filter{chr}, Peak_Begin);
end
Selex_Peaks = horzcat(selexStartsPerChr{:});
Selex_Peak_Number = numel(Selex_Peaks);

%% Identify motifs in ChIP peaks
load('Gcn4_ChIP_WT_induced_peaks.mat','ChIPchrNo','ChIPpeak_start','ChIPpeak_end');
% Make Induced WT ChIP peak filter
% One 0/1 vector per chromosome, set to 1 at every position covered by a peak.
% NOTE(review): the filter is built for chromosomes 1-16 only; presumably no
% motif or peak lies on any further sequence - confirm.
ChIP_Filter = cell(1, 16);
for chr = 1:16
    ChIP_Filter{chr} = zeros(1, chrLen(chr));
end

for d = 1:numel(ChIPchrNo)
    ChIP_Filter{ChIPchrNo(d)}(1,ChIPpeak_start(d):ChIPpeak_end(d)) = 1;
end
% Determine if central nucleotide in each motif is in a ChIP peak. 
% No. of motifs in ChIP peaks = 389.
% Sized from the motif list instead of a hard-coded site count.
motifInChIPPeak = arrayfun(@(c, x) ChIP_Filter{c}(x) == 1, Motif_Chr_No, Motif_Coord);
ChIP_Motif = strings(numel(Motif_Chr_No),1);
ChIP_Motif(motifInChIPPeak) = 'ChIP';
ChIP_Peak_Count = nnz(motifInChIPPeak);
% Check no. of peaks = 546
% A peak start is registered wherever the filter reads 0 0 1 1; the hits of
% all 16 chromosomes are pooled and counted.
Peak_Begin = [0 0 1 1];
chipStartsPerChr = cell(1, 16);
for chr = 1:16
    chipStartsPerChr{chr} = strfind(ChIP_Filter{chr}, Peak_Begin);
end
ChIP_Peaks = horzcat(chipStartsPerChr{:});
ChIP_Peak_Number = numel(ChIP_Peaks);

%% Count motifs common to G-Selex and Induced ChIP = 352
% Counts rows flagged in both columns; every row is examined, whatever the
% length of the motif list.
Common_Motif_Count = nnz(Selex_Motif == "G-SELEX" & ChIP_Motif == "ChIP");

%% Get motif occupancy values: CTC_G4305 180711 Round 3
% Each section below loads one per-chromosome occupancy cell array 'Occ' and
% reads its value at the central nucleotide of every motif. Output vectors
% are sized from the motif list, matching the loop bounds, rather than from a
% hard-coded site count. 'Occ' is cleared after each use so the next file
% cannot be confused with the previous one.
load('Occupancy_CTC_G4305_0_5000.mat', 'Occ');
Occ_180711_R3 = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    Occ_180711_R3(b) = Occ{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear Occ

%% Get motif occupancy values: CTC_G4R312 180808 Round 3
load('Occupancy_CTC_G4R312_0_5000.mat', 'Occ');
Occ_180808_R3 = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    Occ_180808_R3(b) = Occ{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear Occ

%% Get motif occupancy values: GCN4W_CTC_R509 of 180702 = Round 3
load('Occupancy_GCN4W_CTC_R509_0_5000.mat', 'Occ');
Occ_180702_R3 = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    Occ_180702_R3(b) = Occ{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear Occ

%% Get motif occupancy values: Mock:  MockW_CTC_R502 of 180702 = Round 3
load('Occupancy_MockW_CTC_R502_0_5000.mat', 'Occ');
Occ_180702_Mock = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    Occ_180702_Mock(b) = Occ{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear Occ

%% Get motif occupancy values: CTC_input_I13 of 180831 = Input
load('Occupancy_CTC_input_I13_0_5000.mat', 'Occ');
Occ_Input = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    Occ_Input(b) = Occ{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear Occ

%% Normalised motif occupancy values for all three G-Selex Round 3 expts
% Each section below loads one per-chromosome cell array 'NormOcc' and reads
% its value at the central nucleotide of every motif. Output vectors are
% sized from the motif list, matching the loop bounds, rather than from a
% hard-coded site count. 'NormOcc' is cleared after each use so the next file
% cannot be confused with the previous one.
load('Norm_Occupancy_Gcn4_GSelex_R3.combined_0_5000.mat', 'NormOcc');
NormOcc_Selex_R3 = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    NormOcc_Selex_R3(b) = NormOcc{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear NormOcc

%% Normalised motif occupancy values for Induced ChIPseq data
load('Norm_Occupancy_SC_Gcn4_WT_I.combined.rmdup_0_500.mat', 'NormOcc');
NormOcc_I_ChIP = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    NormOcc_I_ChIP(b) = NormOcc{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear NormOcc

%% Normalised motif occupancy values for Uninduced ChIPseq data
load('Norm_Occupancy_SC_Gcn4_WT_U.combined.rmdup_0_500.mat', 'NormOcc');
NormOcc_U_ChIP = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    NormOcc_U_ChIP(b) = NormOcc{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear NormOcc

%% Normalised motif occupancy values for gcn4null (induced) ChIPseq data
load('Norm_Occupancy_SC_Gcn4_gcn4_I.combined.rmdup_0_500.mat', 'NormOcc');
NormOcc_gcn4_ChIP = zeros(numel(Motif_Chr_No),1);
for b = 1:numel(Motif_Chr_No)
    NormOcc_gcn4_ChIP(b) = NormOcc{Motif_Chr_No(b)}(1, Motif_Coord(b));
end
clear NormOcc

%% save and write table
% Store the per-motif results in a MAT-file, then export the same columns as
% a CSV table. Three CSV headers differ from the workspace variable names:
% ChIP_Motif -> 'ChIP', Selex_Motif -> 'G-SELEX', Occ_180702_Mock -> 'Occ_Mock'.
savedVariables = { ...
    'Motif_Chr_No', 'Motif_Coord', 'MotifSeq', ...
    'Motif_Type', 'Motif_RY_Type', 'Motif_Location', ...
    'ORF_name', 'TSS_name', 'ChIP_Motif', 'Selex_Motif', ...
    'Occ_180711_R3', 'Occ_180808_R3', 'Occ_180702_R3', ...
    'Occ_Input', 'Occ_180702_Mock', ...
    'NormOcc_Selex_R3', 'NormOcc_I_ChIP', 'NormOcc_U_ChIP', 'NormOcc_gcn4_ChIP'};
save('Gcn4_Motif_norm_occupancies.mat', savedVariables{:});

csvHeaders = { ...
    'Motif_Chr_No', 'Motif_Coord', 'MotifSeq', ...
    'Motif_Type', 'Motif_RY_Type', 'Motif_Location', ...
    'ORF_name', 'TSS_name', 'ChIP', 'G-SELEX', ...
    'Occ_180711_R3', 'Occ_180808_R3', 'Occ_180702_R3', ...
    'Occ_Input', 'Occ_Mock', ...
    'NormOcc_Selex_R3', 'NormOcc_I_ChIP', 'NormOcc_U_ChIP', 'NormOcc_gcn4_ChIP'};
T = table( ...
    Motif_Chr_No, Motif_Coord, MotifSeq, ...
    Motif_Type, Motif_RY_Type, Motif_Location, ...
    ORF_name, TSS_name, ChIP_Motif, Selex_Motif, ...
    Occ_180711_R3, Occ_180808_R3, Occ_180702_R3, ...
    Occ_Input, Occ_180702_Mock, ...
    NormOcc_Selex_R3, NormOcc_I_ChIP, NormOcc_U_ChIP, NormOcc_gcn4_ChIP, ...
    'VariableNames', csvHeaders);
writetable(T, 'Table_S4_Gcn4_AP1_site_occupancies.csv');



    
