%% 2 December 2021. Coey & Clark Table S2: ChIP data.
% Based on 'Gcn4_scatter_NormInducedMotifOcc_v_InductionRatio_v5.m'
% ChIP data imported from Rawal AddFile117.xlsx
% Produces Table_S2_Gcn4_ChIP_peaks.csv

% Per-peak ChIP annotations; only the variables listed here are read from the file.
peakDataVars = {'Peak_No','ChIPchrNo','ChIPpeak_start','ChIPpeak_end', ...
    'NearestTSS','CommonName','Rpb3IU','ChIP_Common_Peaks','ChIP_Peaks_with_Motif'};
load('Gcn4_ChIP_peaks_data_210605.mat', peakDataVars{:});
clear peakDataVars
% Genome: struct array indexed by chromosome number, with the sequence in .Seq
load('sacCer3_genome.mat', 'genome');

%% Get the sequences of all ChIP peaks (546 in the published data set)
% Preallocate so the result always has exactly one row per peak. Without this
% the array is regrown on every iteration, and rows left over from an earlier
% run in the same workspace would survive if the peak list were shorter.
nPeaks = numel(ChIPchrNo);
MotifPeakSeqs = strings(nPeaks,1);
for a = 1:nPeaks
    % Chromosome sequence from the peak start to the peak end coordinate (inclusive)
    Peak_Seq = genome(ChIPchrNo(a,1)).Seq(ChIPpeak_start(a,1):ChIPpeak_end(a,1));
    MotifPeakSeqs(a,1) = string(Peak_Seq);
end

%% Identify peaks with AP1 motifs and record coords
% Use cell arrays because some peaks (18 in the published data) have 2 motifs.
% Each entry holds the index, within the peak sequence, of the first T of the
% 7-bp motif.
% G-motif is TGAGTCA on top strand; C-motif is TGACTCA on top strand
nPeakSeqs = numel(MotifPeakSeqs);   % size everything from the data, not a fixed 546
Motif_G = cell(nPeakSeqs,1);
Motif_C = cell(nPeakSeqs,1);
Motifs  = cell(1,nPeakSeqs);        % reset, so no stale entries from earlier runs
for b = 1:nPeakSeqs
    Motif_G{b} = strfind(MotifPeakSeqs(b),'TGAGTCA');
    Motif_C{b} = strfind(MotifPeakSeqs(b),'TGACTCA');
    % strfind returns row vectors, so join them side by side. Stacking them
    % vertically fails with a dimension mismatch whenever the two motifs occur
    % a different (non-zero) number of times in one peak, e.g. two G-motifs
    % and one C-motif. G-motif hits stay first, as before.
    Motifs{b} = [Motif_G{b}, Motif_C{b}];
    % Peaks without any motif are marked with NaN
    if isempty(Motifs{b})
        Motifs{b} = NaN;
    end
end
%% Find cells with > 1 motif: 18 peaks with 2 motifs
% Count the recorded motif positions per peak, then visit only the peaks with
% more than one and append their indices to a column vector.
ExcessMotifs = [];
for d = reshape(find(cellfun(@numel, Motifs(1:546)) > 1), 1, [])
    ExcessMotif = d;
    ExcessMotifs(end+1,1) = ExcessMotif; %#ok<SAGROW>
end

%% Convert cell array to double array and two columns to get both motifs
% ChIP_Motif:  first recorded motif position of each peak (NaN if the peak has none).
% Extra_Motif: second recorded position; stays NaN when there is only one entry.
ChIP_Motif = NaN(546,1);
Extra_Motif = NaN(546,1);
for d = 1:546
    ChIP_Motif(d) = Motifs{d}(1);
    if numel(Motifs{d}) >= 2
        Extra_Motif(d) = Motifs{d}(2);
    end
end

%% Adjust motif coordinate to get chr coord of motif midpoint (+3)
% A motif index of 1 is the peak start itself, so the chromosome coordinate of
% the first T is peak start + index - 1. The midpoint of the 7-bp motif lies
% 3 nt further on, which gives peak start + index + 2.
% NaN (no motif) propagates through the sum.
Motif_Centres = zeros(546,1);
Extra_Motif_Centres = zeros(546,1);
Motif_Centres(:) = ChIP_Motif(1:546) + reshape(ChIPpeak_start(1:546), [], 1) + 2;
Extra_Motif_Centres(:) = Extra_Motif(1:546) + reshape(ChIPpeak_start(1:546), [], 1) + 2;

%% Get motif sequences for each peak: NTGA(C/G)TCAN = 9 nt
% The 9-nt window is the 7-bp motif plus one flanking base on each side, read
% from the chromosome sequence around the motif midpoint. Entries whose
% midpoint is NaN (no such motif in the peak) are assigned NaN.
MotifSeq = strings(546,1);
ExtraMotifSeq = strings(546,1);
for d = 1:546
    if ~isnan(Motif_Centres(d))
        MotifSeq(d) = string(genome(ChIPchrNo(d)).Seq(Motif_Centres(d) + (-4:4)));
    else
        MotifSeq(d) = NaN;
    end

    if ~isnan(Extra_Motif_Centres(d))
        ExtraMotifSeq(d) = string(genome(ChIPchrNo(d)).Seq(Extra_Motif_Centres(d) + (-4:4)));
    else
        ExtraMotifSeq(d) = NaN;
    end
end

%% Identify motif type
% C or G motif: C-motif = 1 (note: some have no motif)
% 371 motifs: 189 C and 182 G.
% Flags are stored as doubles and refer to the first motif window of each peak.
C_Motif = double(contains(MotifSeq(1:546),'TGACTCA'));
G_Motif = double(contains(MotifSeq(1:546),'TGAGTCA'));
Total_C = sum(C_Motif);
Total_G = sum(G_Motif);
Total_CG = Total_C + Total_G;

%% Specify first and second motif types. Bases are for C-strand
% Each 9-nt window is N-TGA(C/G)TCA-N. The "type" is the pair of flanking bases
% read on the strand that carries the C-motif (TGACTCA):
%   C-motif on top strand: flanks are taken as they stand (5' base, 3' base).
%   G-motif on top strand: the C-motif is on the bottom strand, so the flanks
%   are reverse-complemented (complement of 3' base, complement of 5' base).
% RY class of the pair (R = purine A/G, Y = pyrimidine C/T):
%   purine then pyrimidine -> 'RY'; pyrimidine then purine -> 'YR';
%   both purines or both pyrimidines -> 'RR-YY'.
% Windows that are not one of the 32 exact upper-case 9-mers (including peaks
% with no motif) keep the empty string "".
%
% The 32-entry lookup is generated once and applied to both motif columns,
% replacing two duplicated, hand-written 32-branch elseif chains. Keys are
% generated in the same order as those chains (C-motif block, then G-motif).
bases = 'ACGT';
comps = 'TGCA';                    % complement of the matching entry in bases
cores = {'TGACTCA','TGAGTCA'};     % C-motif, then G-motif
motifKeys  = strings(32,1);        % 9-mer to recognise
flankPairs = strings(32,1);        % flanking bases on the C-strand
ryClasses  = strings(32,1);        % 'RY', 'YR' or 'RR-YY'
k = 0;
for c = 1:2
    for p = 1:4
        for q = 1:4
            k = k + 1;
            motifKeys(k) = string([bases(p) cores{c} bases(q)]);
            if c == 1
                pair = [bases(p) bases(q)];
            else
                % G-motif: read the flanks from the opposite strand
                pair = [comps(q) comps(p)];
            end
            flankPairs(k) = string(pair);
            firstIsPurine  = any(pair(1) == 'AG');
            secondIsPurine = any(pair(2) == 'AG');
            if firstIsPurine && ~secondIsPurine
                ryClasses(k) = 'RY';
            elseif ~firstIsPurine && secondIsPurine
                ryClasses(k) = 'YR';
            else
                ryClasses(k) = 'RR-YY';
            end
        end
    end
end

% Classify the first motif of every peak
Motif_Type = strings(numel(MotifSeq),1);
Motif_RY_Type = strings(numel(MotifSeq),1);
for n = 1:numel(MotifSeq)
    hit = find(motifKeys == MotifSeq(n), 1);   % empty for missing/unrecognised windows
    if ~isempty(hit)
        Motif_Type(n) = flankPairs(hit);
        Motif_RY_Type(n) = ryClasses(hit);
    end
end

% Classify the second motif (where present) with the same lookup
Extra_Motif_Type = strings(numel(ExtraMotifSeq),1);
Extra_Motif_RY_Type = strings(numel(ExtraMotifSeq),1);
for n = 1:numel(ExtraMotifSeq)
    hit = find(motifKeys == ExtraMotifSeq(n), 1);
    if ~isempty(hit)
        Extra_Motif_Type(n) = flankPairs(hit);
        Extra_Motif_RY_Type(n) = ryClasses(hit);
    end
end

%% Get norm occ for AP-1 motif coords - use 1st or 2nd motif in order RY, RR-YY, YR
load('Norm_Occupancy_SC_Gcn4_WT_I.combined.rmdup_0_500.mat','NormOcc');
% The second motif is read when it is RY, or when it is RR-YY and the first
% motif is not RY. In every other case the first motif is read.
% Peaks with no first motif keep NaN.
NormOcc_Motif = NaN(546,1);
for d = 1:546
    if isnan(Motif_Centres(d))
        continue
    end
    if Extra_Motif_RY_Type(d) == 'RY' || ...
            (~(Motif_RY_Type(d) == 'RY') && Extra_Motif_RY_Type(d) == 'RR-YY')
        NormOcc_Motif(d) = NormOcc{ChIPchrNo(d)}(Extra_Motif_Centres(d));
    else
        NormOcc_Motif(d) = NormOcc{ChIPchrNo(d)}(Motif_Centres(d));
    end
end

% Cross-check; get occupancies for motifs 1 and 2
% Start from NaN and fill in only the peaks that have the motif in question.
NormOcc_Motif1 = NaN(546,1);
NormOcc_Motif2 = NaN(546,1);
for d = 1:546
    if ~isnan(Motif_Centres(d))
        NormOcc_Motif1(d) = NormOcc{ChIPchrNo(d)}(Motif_Centres(d));
    end
    if ~isnan(Extra_Motif_Centres(d))
        NormOcc_Motif2(d) = NormOcc{ChIPchrNo(d)}(Extra_Motif_Centres(d));
    end
end

% The occupancy data is not needed beyond this point
clear NormOcc
%% Mark 456 ChIP peaks common to G-SELEX
% Peaks flagged 1 in ChIP_Common_Peaks are labelled; all others stay "".
ChIP_Selex_Peaks = strings(546,1);
ChIP_Selex_Peaks(ChIP_Common_Peaks(1:546) == 1) = 'G-SELEX';
% Each labelled entry contains the text exactly once, so the sum of the
% per-entry counts is the number of labelled peaks.
ChIP_Selex_Peak_Count = count(ChIP_Selex_Peaks, 'G-SELEX');
No_ChIP_Selex_Peaks = sum(ChIP_Selex_Peak_Count);

%% Save data as csv file:
% CSV column headers, listed in the same order as the variables passed to
% table() below. Note that the flanking-base pairs (Motif_Type) are exported
% under '..._Flanking_Bases' and the RY classes (Motif_RY_Type) under '..._Type'.
columnNames = {'Peak_No','Chr_No','Peak_start','Peak_end','Peak_Sequence',...
    'Mid_Motif_Coord.','Second_Motif_Coord.','Motif_sequence','Second_Motif_sequence',...
    'Motif_Flanking_Bases','Second_Motif_Flanking_Bases','Motif_Type','Second_Motif_Type', ...
    'Norm_Occ_Motif','Rpb3IU','NearestTSS','CommonName','G-SELEX_peak'};

T = table(Peak_No, ChIPchrNo, ChIPpeak_start, ChIPpeak_end, MotifPeakSeqs, ...
    Motif_Centres, Extra_Motif_Centres, MotifSeq, ExtraMotifSeq, ...
    Motif_Type, Extra_Motif_Type, Motif_RY_Type, Extra_Motif_RY_Type, ...
    NormOcc_Motif, Rpb3IU, NearestTSS, CommonName, ChIP_Selex_Peaks, ...
    'VariableNames', columnNames);
clear columnNames

writetable(T, 'Table_S2_Gcn4_ChIP_peaks.csv');


