%% 2 December 2021. Coey & Clark Table S3 (G-SELEX peaks) 
% Produces Table_S3 as a csv file with all G-SELEX peak data
% Correlations between G-SELEX replicates 1, 2, 3 = 0.86, 0.92, 0.98.

load('Gcn4_SELEX_all_replicates_shared_peaks_bed.mat','chrNo','peak_start', ...
    'peak_end');
load('sacCer3_genome.mat', 'genome');

%% Get the sequences of all SELEX peaks (2359 in this data set)
% PeakSeqs is preallocated from the number of loaded peaks so that a stale
% PeakSeqs left in the workspace by an earlier run cannot leak extra rows
% into the later steps.
nPeaks = numel(chrNo);
PeakSeqs = strings(nPeaks,1);
for a = 1:nPeaks
    % Slice the peak interval out of the chromosome sequence and store it
    % as a string scalar.
    Peak_Seq = genome(chrNo(a)).Seq(peak_start(a):peak_end(a));
    PeakSeqs(a) = string(Peak_Seq);
end

%% Identify peaks with AP1 motifs and record coords
% Use cell array because 12 of these peaks have 2 motifs
% strfind gives the index (within the peak sequence) of the first T of each
% 7-bp motif occurrence.
nPeaks = numel(PeakSeqs);
Motif_G = cell(nPeaks,1);
Motif_C = cell(nPeaks,1);
Motifs  = cell(nPeaks,1);
for b = 1:nPeaks
    Motif_G{b} = strfind(PeakSeqs(b),'TGAGTCA');
    Motif_C{b} = strfind(PeakSeqs(b),'TGACTCA');

    % Combine the hit lists: G-motif hits first, then C-motif hits.
    % strfind returns row vectors, so they are joined horizontally; stacking
    % them vertically errors as soon as the two lists have different
    % non-zero lengths (e.g. two G hits and one C hit in the same peak).
    Motifs{b} = [Motif_G{b}(:).', Motif_C{b}(:).'];

    % Peaks without any motif are marked with NaN so that later steps can
    % always read element 1.
    if isempty(Motifs{b})
        Motifs{b} = NaN;
    end
end

%% Find cells with > 1 motif: 12 peaks with 2 motifs
% ExcessMotifs lists the peak indices whose Motifs entry holds more than one
% motif position. The NaN placeholder used for motif-less peaks counts as a
% single element, so those peaks are never selected.
motifsPerPeak = cellfun(@numel, Motifs(:));
ExcessMotifs = find(motifsPerPeak > 1);

%% Convert cell array to double array and two columns to get both motifs
% SELEX_Motif takes the first recorded position of every peak (NaN when the
% peak has no motif); Extra_Motif takes the second position when there is
% one and stays NaN otherwise. Positions beyond the second are not kept.
SELEX_Motif = NaN(2359,1);
Extra_Motif = NaN(2359,1);
for d = 1:2359
    hits = Motifs{d};
    SELEX_Motif(d) = hits(1);
    if numel(hits) > 1
        Extra_Motif(d) = hits(2);
    end
end

%% Adjust motif coordinate to get chromosome coordinate of motif midpoint (+3)
% The motif index is 1-based within the peak, so the chromosome coordinate of
% the first T is peak_start + index - 1; the midpoint is 3 nt further on,
% giving peak_start + index + 2. NaN indices (no motif) stay NaN.
% NOTE(review): assumes peak_start is a double vector - confirm.
peakStartCol = reshape(peak_start(1:2359), [], 1);
Motif_Centres = SELEX_Motif + peakStartCol + 2;
Extra_Motif_Centres = Extra_Motif + peakStartCol + 2;

%% Count SELEX peaks with one or more motifs = 604.
% Total motifs in peaks = 604 + 12 = 616.
% A peak has a motif exactly when its motif centre is not NaN.
Peaks_with_Motif_count = nnz(~isnan(Motif_Centres));

%% Get motif sequences in each peak, including peaks with 2 motifs
% Each entry is the 9-nt window centred on the motif midpoint (centre-4 to
% centre+4), i.e. the 7-bp motif plus one flanking base on each side.
% Peaks without a motif are assigned NaN, which a string array stores as
% <missing>.
MotifSeq = strings(2359,1);
MotifSeq(isnan(Motif_Centres)) = NaN;
for d = find(~isnan(Motif_Centres)).'
    centre = Motif_Centres(d);
    MotifSeq(d) = string(genome(chrNo(d)).Seq(centre-4:centre+4));
end

% Same 9-nt window for the second motif, where a peak has one.
ExtraMotifSeq = strings(2359,1);
ExtraMotifSeq(isnan(Extra_Motif_Centres)) = NaN;
for d = find(~isnan(Extra_Motif_Centres)).'
    centre = Extra_Motif_Centres(d);
    ExtraMotifSeq(d) = string(genome(chrNo(d)).Seq(centre-4:centre+4));
end

%% Count C motifs (TGACTCA in top strand) and G motifs (TGAGTCA in top strand)
% 604 motifs: 297 C and 307 G. Cross-check
% contains is applied to the whole string array at once; <missing> entries
% (peaks without a motif) give false. The flags are stored as doubles.
C_Motif = double(contains(MotifSeq,'TGACTCA'));
G_Motif = double(contains(MotifSeq,'TGAGTCA'));
Total_C = sum(C_Motif);
Total_G = sum(G_Motif);
Total_CG = Total_C + Total_G;

%% Specify motif types: C-motif then G-motif. Bases are for C-strand
% Build one lookup table of the 32 possible 9-nt sequences (flank + 7-bp
% motif + flank) and apply it to both the first and the second motif of
% each peak.
%   * C-motif (xTGACTCAy): flanking-base code is 'xy' as read.
%   * G-motif (xTGAGTCAy): the C-strand is the reverse complement, so the
%     code is [complement(y) complement(x)].
%   * RY type: purine(A/G) then pyrimidine(C/T) -> 'RY'; pyrimidine then
%     purine -> 'YR'; two purines or two pyrimidines -> 'RR-YY'.
bases    = 'ACGT';
isPurine = [true false true false];       % flags for A, C, G, T
cores    = {'TGACTCA','TGAGTCA'};         % C-motif first, then G-motif

motifKey  = strings(32,1);
flankCode = strings(32,1);
ryCode    = strings(32,1);
k = 0;
for c = 1:2
    for iL = 1:4
        for iR = 1:4
            k = k + 1;
            motifKey(k) = [bases(iL) cores{c} bases(iR)];
            if c == 1
                left  = iL;
                right = iR;
            else
                % In 'ACGT' the complement of base number m is base
                % number 5-m (A<->T, C<->G); the flanks also swap sides.
                left  = 5 - iR;
                right = 5 - iL;
            end
            flankCode(k) = [bases(left) bases(right)];
            if isPurine(left) == isPurine(right)
                ryCode(k) = 'RR-YY';
            elseif isPurine(left)
                ryCode(k) = 'RY';
            else
                ryCode(k) = 'YR';
            end
        end
    end
end

Motif_Type = strings(2359,1);
Motif_RY_Type = strings(2359,1);

%% Repeat for the 12 extra motifs
Extra_Motif_Type = strings(2359,1);
Extra_Motif_RY_Type = strings(2359,1);

% Walk the table backwards so that, if a sequence ever matched more than one
% key, the earliest key wins (same priority as a top-to-bottom if/elseif
% chain). Sequences matching no key keep the empty string "".
for k = 32:-1:1
    hit = contains(MotifSeq, motifKey(k));
    Motif_Type(hit) = flankCode(k);
    Motif_RY_Type(hit) = ryCode(k);

    hit = contains(ExtraMotifSeq, motifKey(k));
    Extra_Motif_Type(hit) = flankCode(k);
    Extra_Motif_RY_Type(hit) = ryCode(k);
end

%% Get normalised occupancy for all G-SELEX peaks (maximum value)
% NormOcc is indexed as a cell array with one occupancy track per
% chromosome number; the peak value is the maximum over the peak interval.
load('Norm_Occupancy_Gcn4_GSelex_R3.combined_0_5000.mat','NormOcc');
PeakOcc = zeros(2359,1);
for p = 1:numel(chrNo)
    chromTrack = NormOcc{chrNo(p)};
    PeakOcc(p) = max(chromTrack(peak_start(p):peak_end(p)));
end

%% Motif occupancy - normalised
% Occupancy read at the motif midpoint; NaN for peaks without a motif.
NormOcc_Motif = NaN(2359,1);
for p = find(~isnan(Motif_Centres)).'
    chromTrack = NormOcc{chrNo(p)};
    NormOcc_Motif(p) = chromTrack(Motif_Centres(p));
end

%% Peak occupancy in G-Selex replicates 1-3
% Use peak max not motif occupancy because many peaks have no motif.
%   replicate 1: 180702 R3
%   replicate 2: 180711 R3
%   replicate 3: 180808 R3
% Each file provides a variable Occ, indexed as one occupancy track per
% chromosome number; Occ is cleared after each replicate.
replicateFiles = {'Occupancy_GCN4W_CTC_R509_0_5000.mat', ...
                  'Occupancy_CTC_G4305_0_5000.mat', ...
                  'Occupancy_CTC_G4R312_0_5000.mat'};
ReplicateOcc = zeros(2359,3);
for r = 1:3
    load(replicateFiles{r}, 'Occ');
    for a = 1:numel(chrNo)
        ReplicateOcc(a,r) = max(Occ{chrNo(a)}(peak_start(a):peak_end(a)));
    end
    clear Occ
end
R1_Occ = ReplicateOcc(:,1);
R2_Occ = ReplicateOcc(:,2);
R3_Occ = ReplicateOcc(:,3);
clear ReplicateOcc replicateFiles

%% Identify G-SELEX peaks which overlap ChIP-seq peaks
% (474 peaks were reported by the earlier version of this section; re-check
% the count, because case (3) below was not actually being tested before.)
% For each G-SELEX peak, ask if it overlaps a ChIP peak on the same chromosome:
% (1) starts within a ChIP peak, (2) ends within a ChIP peak,
% (3) includes all of a ChIP peak.
% Previously case (3) compared mean(peak_start+peak_end) - the mean of a
% single number, i.e. the SUM of the two coordinates - against the ChIP
% interval, so a SELEX peak spanning a whole ChIP peak was never flagged.
load('Gcn4_ChIP_WT_induced_peaks.mat','ChIPchrNo','ChIPpeak_start','ChIPpeak_end');
nPeaks = numel(chrNo);
Selex_Peak_in_ChIP = strings(nPeaks,1);

% Column copies so every ChIP peak in the file is tested in one vectorised
% comparison (no hard-coded ChIP peak count).
chipChr   = ChIPchrNo(:);
chipStart = ChIPpeak_start(:);
chipEnd   = ChIPpeak_end(:);

for d = 1:nPeaks
    onSameChr    = chipChr == chrNo(d);
    startsInside = peak_start(d) > chipStart & peak_start(d) < chipEnd;   % case (1)
    endsInside   = peak_end(d)   > chipStart & peak_end(d)   < chipEnd;   % case (2)
    coversChIP   = peak_start(d) <= chipStart & peak_end(d) >= chipEnd;   % case (3)
    if any(onSameChr & (startsInside | endsInside | coversChIP))
        Selex_Peak_in_ChIP(d) = 'ChIP';
    end
end

% Count no. of SELEX peaks overlapping ChIP peaks
Selex_Peak_in_ChIP_Count = count(Selex_Peak_in_ChIP, 'ChIP');
No_ChIP_Selex_Peaks = sum(Selex_Peak_in_ChIP_Count);

%% Save data as csv file:
% One row per G-SELEX peak. Column headers are listed separately, in the
% same order as the variables passed to table().
columnNames = {'Chromosome No.','Peak Start Coord.','Peak End Coord.', ...
    'Peak Sequence','Motif Coord.','Second Motif Coord.', ...
    'Motif sequence','Second Motif Sequence', ...
    'Motif Flanking Bases','Second Motif Flanking Bases', ...
    'Motif RY Type','Second Motif RY Type','Peak Norm. Occupancy', ...
    'Peak Occupancy Rep.1','Peak Occupancy Rep.2','Peak Occupancy Rep.3', ...
    'ChIP peak'};

T = table(chrNo, peak_start, peak_end, PeakSeqs, ...
    Motif_Centres, Extra_Motif_Centres, ...
    MotifSeq, ExtraMotifSeq, ...
    Motif_Type, Extra_Motif_Type, ...
    Motif_RY_Type, Extra_Motif_RY_Type, ...
    PeakOcc, R1_Occ, R2_Occ, R3_Occ, Selex_Peak_in_ChIP, ...
    'VariableNames', columnNames);

writetable(T, 'Table_S3_Gcn4_SELEX_peaks.csv');

%% Calculate occupancy correlations between G-SELEX replicates: 0.86, 0.92, 0.98
% Pairwise correlation of the per-peak maximum occupancies of the three
% replicates. corrcoef with two inputs returns a 2x2 matrix; the
% coefficient of interest is the off-diagonal element, e.g. R1_R2(1,2).
% NOTE(review): the three values quoted above are presumably listed in the
% order R1-R2, R1-R3, R2-R3 - confirm.
% These results are computed after the CSV is written and are not saved.
R1_R2 = corrcoef(R1_Occ, R2_Occ);
R1_R3 = corrcoef(R1_Occ, R3_Occ);
R2_R3 = corrcoef(R2_Occ, R3_Occ);




