from Bio import SeqIO
import re
import os
from collections import defaultdict
import pandas as pd

def parse_cigar(file_path):
    """Parse a text file that holds one CIGAR string per line.

    Blank lines are skipped. Every other line is scanned for
    ``<count><op>`` pairs (ops ``M I D N S H P = X``); text that does not form
    such a pair is ignored, so a line without any pair yields an empty list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line. Each entry is a list of
        ``(length, op)`` tuples, ``length`` being an ``int`` and ``op`` the
        one-character operation code.
    """
    op_pattern = re.compile(r'(\d+)([MIDNSHP=X])')
    with open(file_path, 'r') as handle:
        stripped_lines = (raw.strip() for raw in handle)
        return [
            [(int(count), code) for count, code in op_pattern.findall(text)]
            for text in stripped_lines
            if text
        ]

def get_sequence(file_path):
    """Return the first sequence of a FASTA file as a plain string.

    The file is assumed to contain a single record: only the first record
    produced by ``SeqIO.parse`` is read and any further records are ignored.

    Args:
        file_path: Path of the FASTA file.

    Returns:
        The sequence of the first record converted to ``str``.
    """
    records = SeqIO.parse(file_path, "fasta")
    first_record = next(records)
    return str(first_record.seq)

def apply_cigar(reference, query, cigar):
    """Expand a parsed CIGAR into two gapped alignment strings.

    The CIGAR is walked from the start of both sequences:

    * ``M``, ``=`` and ``X`` copy ``length`` characters from both sequences;
    * ``I`` copies ``length`` characters from ``query`` and pads the
      reference row with ``-``;
    * ``D`` copies ``length`` characters from ``reference`` and pads the
      query row with ``-``.

    Any other operation (``N``, ``S``, ``H``, ``P``) is skipped without
    producing output or advancing either position.

    Args:
        reference: Reference sequence.
        query: Query sequence.
        cigar: Iterable of ``(length, op)`` pairs, e.g. one entry of the list
            returned by ``parse_cigar``.

    Returns:
        ``(aligned_reference, aligned_query)`` as two strings.
    """
    ref_index, query_index = 0, 0
    aligned_ref, aligned_query = [], []

    for length, op in cigar:
        # '=' (sequence match) and 'X' (mismatch) are the extended spelling of
        # 'M': they consume both sequences, so they must advance both cursors
        # or every later slice would be taken from the wrong offset.
        if op in ('M', '=', 'X'):
            aligned_ref.append(reference[ref_index:ref_index + length])
            aligned_query.append(query[query_index:query_index + length])
            ref_index += length
            query_index += length
        elif op == 'I':  # Insertion: present in the query only
            aligned_ref.append('-' * length)
            aligned_query.append(query[query_index:query_index + length])
            query_index += length
        elif op == 'D':  # Deletion: present in the reference only
            aligned_ref.append(reference[ref_index:ref_index + length])
            aligned_query.append('-' * length)
            ref_index += length
        # Handling for the remaining CIGAR operations can be added here if needed.

    return ''.join(aligned_ref), ''.join(aligned_query)

# Input file paths (all relative to the current working directory)
chm13_path = "../../data/human_genome/chm13/chm13_cen/chrX_cen.fasta"
chm1_path = "../../data/human_genome/chm1/chm1_cen/chrX_cen.fasta"
rama_cigar_path = './result/RaMA/cigar.txt'  # change to the appropriate file as needed
unialigner_cigar_path = './result/UniAligner/cigar.txt'

# Read the reference (CHM13) and query (CHM1) sequences
chm13_sequence = get_sequence(chm13_path)
chm1_sequence = get_sequence(chm1_path)

# Parse the CIGAR files; each result holds one parsed CIGAR per non-blank line
rama_cigar = parse_cigar(rama_cigar_path)
unialigner_cigar = parse_cigar(unialigner_cigar_path)

# Apply the first CIGAR of each file to build the gapped alignment strings
aligned_chm13_rama, aligned_chm1_rama = apply_cigar(chm13_sequence, chm1_sequence, rama_cigar[0])
aligned_chm13_unialigner, aligned_chm1_unialigner = apply_cigar(chm13_sequence, chm1_sequence, unialigner_cigar[0])

def is_multiple_of_2057(length):
    """Tell whether ``length`` is an exact multiple of 2057.

    Args:
        length: Length to test.

    Returns:
        ``True`` when dividing ``length`` by 2057 leaves no remainder
        (0 included), ``False`` otherwise.
    """
    # The remainder is the distance down to the previous multiple; the
    # distance up to the next one is never 0, so the remainder alone decides.
    return length % 2057 == 0

def extract_cigar_length(cigar):
    """Extract the alignment length and the last operation of a CIGAR string.

    Args:
        cigar: CIGAR text such as ``"12M2057I"``.

    Returns:
        A ``(length, op)`` tuple. ``length`` is the sum of the counts of the
        operations that contribute to the alignment length (``M``, ``=``,
        ``X``, ``D``, ``I``). ``op`` is the code of the LAST operation found
        in the string, whatever its kind, or ``''`` when the string contains
        no CIGAR operation at all (in which case ``length`` is 0).
    """
    # Match every <count><op> pair
    matches = re.findall(r'(\d+)([MIDNSHP=X])', cigar)
    length = 0
    # Pre-set so that a string without any operation returns (0, '') instead
    # of failing on an unassigned variable.
    last_op = ''
    for count, code in matches:
        # Only M, =, X, D and I contribute to the alignment length
        if code in "MD=XI":
            length += int(count)
        last_op = code
    return length, last_op

file_path = './result/RaMA/confidence.csv'

# Statistics over the RaMA confidence report, computed only when it exists.
# Each row's 'cigar' column is reduced to (length, last op) and the row is
# counted when it is an insertion/deletion longer than 5 whose 'confidence'
# column equals 1.
if os.path.exists(file_path):
    # Load the CSV report
    df = pd.read_csv(file_path)

    long_indel_count = 0          # confident indels longer than 5
    multiplicity = 0              # ... of which the length is a multiple of 2057
    distribution = defaultdict(int)  # multiple (length // 2057) -> occurrences
    sv = 0                        # confident indels longer than 50
    for _, row in df.iterrows():
        # Length covered by this row's CIGAR and its last operation
        length, op = extract_cigar_length(row['cigar'])

        if not (op in "ID" and length > 5 and row['confidence'] == 1):
            continue

        long_indel_count += 1
        if length > 50:
            sv += 1
        if is_multiple_of_2057(length):
            multiplicity += 1
            distribution[length // 2057] += 1  # tally how often this multiple occurs

    print(f"RaMA: 长度大于5的可信 indel 数量为 {long_indel_count}，其中是2057倍数的可信 indel 数量为 {multiplicity}")
    print("RaMA: 2057 倍数的 indel 分布:", dict(distribution))
    print("RaMA: 2057 倍数的 总个数:", sum(distribution.values()))
    print("RaMA: 2057 倍数的 总倍数:", sum(k * v for k, v in distribution.items()))
    print("reliable RaMA SV:", sv)


def count_indels(cigar):
    """Count the long indels of a parsed CIGAR.

    An operation counts as a long indel when its code is ``I`` or ``D`` and
    its length is greater than 5.

    Args:
        cigar: Iterable of ``(length, op)`` pairs.

    Returns:
        ``(indel_count, multiple_of_2057_count)``: the number of long indels
        and how many of them ``is_multiple_of_2057`` accepts.
    """
    long_indel_lengths = [size for size, code in cigar if code in 'ID' and size > 5]
    exact_multiples = sum(1 for size in long_indel_lengths if is_multiple_of_2057(size))
    return len(long_indel_lengths), exact_multiples
def indel_multiple_distribution(cigar):
    """Build a histogram of the indels whose length is a multiple of 2057.

    Only ``I``/``D`` operations longer than 5 whose length is exactly
    divisible by 2057 are counted.

    Args:
        cigar: Iterable of ``(length, op)`` pairs.

    Returns:
        A ``defaultdict(int)`` mapping the multiple (``length // 2057``) to
        the number of qualifying indels with that multiple.
    """
    histogram = defaultdict(int)

    for size, code in cigar:
        # Skip everything that is not an insertion/deletion longer than 5
        if not (code in 'ID' and size > 5):
            continue
        multiple, remainder = divmod(size, 2057)
        if remainder == 0:  # exact multiples only
            histogram[multiple] += 1

    return histogram

def SV_distribution(cigar):
    """Count the insertions/deletions longer than 50 and print the total.

    The count is printed with the fixed label ``"RaMA SV:"``.

    Args:
        cigar: Iterable of ``(length, op)`` pairs.

    Returns:
        The number of ``I``/``D`` operations whose length is greater than 50.
    """
    # One unit per insertion or deletion longer than 50
    sv = sum(1 for size, code in cigar if code in 'ID' and size > 50)
    print("RaMA SV:", sv)
    return sv

# RaMA: indel counts taken from the first parsed CIGAR
indel_count_rama, multiple_of_2057_count_rama = count_indels(rama_cigar[0])
print(f"RaMA: 长度大于5的indel数量为 {indel_count_rama}，其中是2057倍数的indel数量为 {multiple_of_2057_count_rama}")
SV_distribution(rama_cigar[0])

# RaMA: distribution of the 2057 multiples (multiple -> number of indels)
rama_distribution = indel_multiple_distribution(rama_cigar[0])
print("RaMA: 2057 倍数的 indel 分布:", dict(rama_distribution))
print("RaMA: 2057 倍数的 总个数:", sum(rama_distribution.values()))
print("RaMA: 2057 倍数的 总倍数:", sum(k * v for k, v in rama_distribution.items()))


# UniAligner: indel counts taken from the first parsed CIGAR
indel_count_unialigner, multiple_of_2057_count_unialigner = count_indels(unialigner_cigar[0])
print(f"UniAligner: 长度大于5的indel数量为 {indel_count_unialigner}，其中是2057倍数的indel数量为 {multiple_of_2057_count_unialigner}")

# UniAligner: distribution of the 2057 multiples (multiple -> number of indels)
unialigner_distribution = indel_multiple_distribution(unialigner_cigar[0])

print("UniAligner: 2057 倍数的 indel 分布:", dict(unialigner_distribution))
print("UniAligner: 2057 倍数的 总个数:", sum(unialigner_distribution.values()))
print("UniAligner: 2057 倍数的 总倍数:", sum(k * v for k, v in unialigner_distribution.items()))

def output_indels_from_alignment(reference, query, cigar, output_file):
    """Append the indels whose length is a multiple of 2057 to a FASTA file.

    The CIGAR is walked from the start of both sequences. For every ``I``
    or ``D`` operation accepted by ``is_multiple_of_2057`` one record is
    written: a header line with source, type, length, multiplicity
    (``length // 2057``) and a start offset, followed by the inserted query
    bases (``I``) or the deleted reference bases (``D``).

    ``M``, ``=`` and ``X`` advance both positions; other operations are
    ignored. The file is opened in append mode, so existing content is kept.

    Args:
        reference: Reference sequence (source of deleted bases).
        query: Query sequence (source of inserted bases).
        cigar: Iterable of ``(length, op)`` pairs.
        output_file: Path of the FASTA file to append to.

    Returns:
        None.
    """
    ref_index, query_index = 0, 0

    with open(output_file, 'a') as out_f:
        for length, op in cigar:
            # '=' and 'X' consume both sequences exactly like 'M'; skipping
            # them would shift every later slice and start offset.
            if op in ('M', '=', 'X'):
                ref_index += length
                query_index += length
            elif op == 'I':  # Insertion: bases come from the query
                if is_multiple_of_2057(length):
                    # NOTE(review): the header reports the reference offset
                    # although the bases are sliced from the query -- confirm
                    # this is the intended coordinate.
                    out_f.write(f">indel_source:CHM1_type:insertion_length:{length}_multiplicity:{length // 2057}_start{ref_index}\n")
                    out_f.write(f"{query[query_index:query_index + length]}\n")
                query_index += length
            elif op == 'D':  # Deletion: bases come from the reference
                if is_multiple_of_2057(length):
                    # NOTE(review): the header reports the query offset
                    # although the bases are sliced from the reference --
                    # confirm this is the intended coordinate.
                    out_f.write(f">indel_source:CHM13_type:deletion_length:{length}_multiplicity:{length // 2057}_start{query_index}\n")
                    out_f.write(f"{reference[ref_index:ref_index + length]}\n")
                ref_index += length

# Run this once the sequences and the parsed CIGARs are available
output_fasta_path = './result/indels_output.fasta'

# Write out the indels of the RaMA alignment
output_indels_from_alignment(chm13_sequence, chm1_sequence, rama_cigar[0], output_fasta_path)

