





import numpy as np

from experiment_paths.experiment_paths import *

import exon_id_library.exon_id_lib as el

import gzip
import intervaltree




# Collapsed-exon tables, one gzipped file per sequencing library.
# The integer prefix of every name is the library number (1..23).
collapsed_exons_list = [
    "1_ETF_S1_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "2_ETF_cleaned_S9_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "3_ETF_cleaned_S10_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "4_ETF_cleaned_S11_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "5_ETF_cleaned_S12_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "6_ETF_S2_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "7_ETF_cleaned_S13_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "8_ETF_cleaned_S14_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "9_ETF_S3_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "10_ETF_cleaned_S15_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "11_ETF_S4_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "12_ETF_cleaned_S16_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "13_ETF_cleaned_S17_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "14_ETF_cleaned_S18_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "15_ETF_S5_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "16_ETF_cleaned_S19_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "17_ETF_cleaned_S20_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "18_ETF_S6_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "19_ETF_cleaned_S21_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "20_ETF_cleaned_S7_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "21_ETF_cleaned_S8_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "22_ETF_cleaned_S22_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
    "23_ETF_cleaned_S23_L001_R1_001.fastq.gz_all_trimmed_simple_paired_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz",
]

# Earlier input location, kept for reference:
#path = exp_output_path.trimmed_fastq_input_files
SAM_files = '/mnt/hgfs/main_ssd/et_main/SAM_synthetic/'
new_stem = '_chr17_scrable_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz'

# Re-point every entry at its synthetic counterpart: strip the trailing
# 57 characters ("_selected_sorted_collapsed_reshape_sorted_collapse.txt.gz"),
# append new_stem, and prefix the SAM_files directory.
collapsed_exons_list = [
    ''.join((SAM_files, file_name[:-57], new_stem))
    for file_name in collapsed_exons_list
]



# Splice-site scoring via maxentpy. Both score matrices are loaded once at
# module level and handed to maxent.score5 / maxent.score3 through `matrix=`
# in get_5ss_seq / get_3ss_seq (presumably to avoid reloading per call).
from maxentpy import maxent
from maxentpy.maxent import load_matrix5, load_matrix3
matrix5 = load_matrix5()  # passed as matrix= to maxent.score5
matrix3 = load_matrix3()  # passed as matrix= to maxent.score3
# Usage example: maxent.score5(seq_5ss, matrix=matrix5)

import os,sys

# Exon ids that were apparently singled out as problematic in an earlier
# analysis; the literal is kept so the check can be re-enabled.
p_set = {
    'chr16:101978246-101978373:-',
    'chr16:101978740-101978958:-',
    'chr16:101979441-101979862:+',
    'chr16:101980288-101980375:+',
    'chr17:90095023-90095134:+',
    'chr17:90096407-90096608:+',
    'chr17:90102357-90102443:+',
    'chr17:90155968-90156092:+',
}

# Synthetic run: the ids above are irrelevant, so the lookup is replaced
# by an empty dict (membership tests against it are always False).
p_set = {}  #don't care about these for synthetic

# Zero-filled vector with 24 slots; the bare string below is a no-op
# expression that seems to label it.
a = np.zeros(24)
'lib_count'

# exon_id -> vector of 24 read-count slots, indexed by library number.
exon_id_dict = dict()

# Number of input lines whose exon id had already been seen (in any file).
duplicate_count = 0
for f_path in collapsed_exons_list:
    file_name = os.path.basename(f_path)
    print('load file: %s' % (file_name))
    # The library number is the integer prefix of the file name.
    lib_num = int(file_name.split('_')[0])
    # The context manager closes the gzip handle even if a line fails to
    # parse; the previous bare iteration leaked one open handle per file.
    with gzip.open(f_path, 'rt') as handle:
        for line in handle:
            line_split = line.strip().split('\t')
            chrom  = line_split[1] #line_split[1] is not always == to line_split[0]
            start  = line_split[2]
            end    = line_split[3]
            strand = line_split[4]
            count  = int(line_split[5])
            exon_id = '%s:%s-%s:%s' % (chrom, start, end, strand)

            if exon_id not in exon_id_dict:
                exon_id_dict[exon_id] = np.zeros(24)
            else:
                duplicate_count += 1

            # Accumulate, so repeated ids within a library add up.
            exon_id_dict[exon_id][lib_num] += count

print('\nDone with exon_id_dict with len {:,}'.format(len(exon_id_dict)))



def get_3ss_seq(exon_id, genome_fasta):
    """Return the 3' splice-site window of an exon and its maxent score.

    Args:
        exon_id: Exon identifier understood by ``el.exon_id_values``; the
            parsed object must expose ``chrom``, ``start``, ``end`` and
            ``strand``.
        genome_fasta: Object indexed as ``genome_fasta[chrom][a:b]``. For
            '-' strand exons the slice must expose ``.reverse.complement``
            (looks like a pyfaidx-style sequence -- TODO confirm).

    Returns:
        Tuple ``(seq, score)``: the extracted window (nominally 23 nt) and
        the value of ``maxent.score3``. ``('N', -5500)`` is returned when
        extraction or scoring fails, or when the strand is neither '+'
        nor '-'.
    """
    ex = el.exon_id_values(exon_id)
    # Unknown strand: return the sentinel instead of failing with an
    # unbound local at the return statement.
    if ex.strand not in ('+', '-'):
        return 'N', -5500
    try:
        # NOTE(review): the window offsets depend on the coordinate
        # convention of el.exon_id_values -- verify against that helper.
        if ex.strand == '+':
            seq = str(genome_fasta[ex.chrom][ex.start-21:ex.start+2])
        else:
            seq = str(genome_fasta[ex.chrom][ex.end-4:ex.end+19].reverse.complement)
        score = maxent.score3(seq, matrix=matrix3)
    except Exception:
        # Deliberate best-effort: any lookup/scoring failure maps to the
        # sentinel, but KeyboardInterrupt/SystemExit now propagate.
        seq = 'N'
        score = -5500
    return seq, score


def get_5ss_seq(exon_id, genome_fasta):
    """Return the 5' splice-site window of an exon and its maxent score.

    Args:
        exon_id: Exon identifier understood by ``el.exon_id_values``; the
            parsed object must expose ``chrom``, ``start``, ``end`` and
            ``strand``.
        genome_fasta: Object indexed as ``genome_fasta[chrom][a:b]``. For
            '-' strand exons the slice must expose ``.reverse.complement``
            (looks like a pyfaidx-style sequence -- TODO confirm).

    Returns:
        Tuple ``(seq, score)``: the extracted window (nominally 9 nt) and
        the value of ``maxent.score5``. ``('N', -5500)`` is returned when
        extraction or scoring fails, or when the strand is neither '+'
        nor '-'.
    """
    ex = el.exon_id_values(exon_id)
    # Unknown strand: return the sentinel instead of failing with an
    # unbound local at the return statement.
    if ex.strand not in ('+', '-'):
        return 'N', -5500
    try:
        # NOTE(review): the window offsets depend on the coordinate
        # convention of el.exon_id_values -- verify against that helper.
        if ex.strand == '+':
            seq = str(genome_fasta[ex.chrom][ex.end-4:ex.end+5])
        else:
            seq = str(genome_fasta[ex.chrom][ex.start-7:ex.start+2].reverse.complement)
        score = maxent.score5(seq, matrix=matrix5)
    except Exception:
        # Deliberate best-effort: any lookup/scoring failure maps to the
        # sentinel, but KeyboardInterrupt/SystemExit now propagate.
        seq = 'N'
        score = -5500
    return seq, score







# Summary statistics over the per-exon count vectors in exon_id_dict.

# Total reads per exon, summed across all library slots (vectorized sum
# instead of Python-level iteration over each array).
exon_totals = [lib_counts.sum() for lib_counts in exon_id_dict.values()]

# default=0 mirrors the initial value used when no exon was loaded.
max_count = max(exon_totals, default=0)
sum_count = sum(exon_totals)

# Kept for compatibility; it is never filled here.
count_list = list()

# Guard against an empty table, which previously raised ZeroDivisionError.
mean_count = sum_count / len(exon_id_dict) if exon_id_dict else 0.0

print('\nMax read count of random exon: {:}'.format(max_count))
# '.2f' prints two decimals; the former '.2' meant two significant digits
# and rendered e.g. 123.4 as '1.2e+02'.
print('\nMean read count per exon: {:.2f}'.format(mean_count))










    
    
    
    
    
