import os
import pandas as pd
import re

# Chromosomes to process: chr1 .. chr22 followed by chrX
chromosomes = [f"chr{label}" for label in [*range(1, 23), "X"]]

# Root folder of the RaMA results
base_path = "./result/RaMA/"

# Results for all chromosomes, stored column-wise: every key is an output
# column and owns its own (initially empty) list of per-chromosome values.
total_stats = {
    column: []
    for column in (
        "chromosome",
        "total length",
        "reliable length",
        "rare match length",
        "reliable ratio (%)",
        "rare match ratio (%)",
        "large indel",
        "large indel count",
        "reliable large indel",
        "reliable large indel count",
        "large indel ratio (%)",
        "reliable large indel ratio (%)",
    )
}

# One CIGAR operation: a run length followed by its operation code.
_CIGAR_OP_PATTERN = re.compile(r'(\d+)([MIDNSHP=X])')

# Operations that occupy a column of the alignment and therefore count
# towards its length.
_CIGAR_LENGTH_OPS = "MID=X"


def extract_cigar_length(cigar):
    """Return the alignment length of a CIGAR string and its last operation.

    Args:
        cigar: CIGAR string such as ``"120="`` or ``"3M2D4I"``.

    Returns:
        A ``(length, op)`` tuple. ``length`` is the sum of the run lengths of
        all M, I, D, = and X operations; N, S, H and P runs are ignored.
        ``op`` is the code of the last operation found in the string,
        whatever its type. When the string contains no recognisable
        operation the result is ``(0, "")``.
    """
    length = 0
    # Default for strings without any operation; previously `op` was left
    # unbound in that case and the return statement raised UnboundLocalError.
    last_op = ""
    for count, last_op in _CIGAR_OP_PATTERN.findall(cigar):
        # Insertions are counted as well: otherwise a pure insertion would
        # report length 0 and could never pass a length threshold.
        if last_op in _CIGAR_LENGTH_OPS:
            length += int(count)
    return length, last_op
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0 when ``whole`` is not positive)."""
    return part / whole * 100 if whole > 0 else 0


# Read each chromosome's confidence.csv and accumulate its statistics
for chrom in chromosomes:
    file_path = os.path.join(base_path, chrom, "confidence.csv")
    if not os.path.exists(file_path):
        print(f"{chrom} 的confidence.csv文件不存在")
        continue

    df = pd.read_csv(file_path)

    # Running totals for this chromosome
    total_length = 0
    reliable_length = 0
    rare_match_length = 0
    large_indel = 0
    large_indel_count = 0
    reliable_large_indel = 0
    reliable_large_indel_count = 0

    # Walk the rows one by one and add up the lengths
    for _, row in df.iterrows():
        cigar = row['cigar']
        is_reliable = row['confidence'] == 1
        is_rare_match = row['rare match'] == 1

        # `op` is the last operation reported by extract_cigar_length
        length, op = extract_cigar_length(cigar)
        total_length += length

        # Rows flagged as reliable
        if is_reliable:
            reliable_length += length

        # Rows flagged as rare match
        if is_rare_match:
            rare_match_length += length

        # An insertion/deletion longer than 5 is treated as a large indel
        if op in "ID" and length > 5:
            large_indel += length
            large_indel_count += 1
            if is_reliable:
                reliable_large_indel += length
                reliable_large_indel_count += 1

    # Percentages; every one of them is relative to the total length
    reliable_pct = _percent(reliable_length, total_length)
    rare_match_pct = _percent(rare_match_length, total_length)
    large_indel_pct = _percent(large_indel, total_length)
    reliable_large_indel_pct = _percent(reliable_large_indel, total_length)

    # Store this chromosome's values, one per output column
    chrom_stats = {
        "chromosome": chrom,
        "total length": total_length,
        "reliable length": reliable_length,
        "rare match length": rare_match_length,
        "reliable ratio (%)": reliable_pct,
        "rare match ratio (%)": rare_match_pct,
        "large indel": large_indel,
        "large indel count": large_indel_count,
        "reliable large indel": reliable_large_indel,
        "reliable large indel count": reliable_large_indel_count,
        "large indel ratio (%)": large_indel_pct,
        "reliable large indel ratio (%)": reliable_large_indel_pct,
    }
    for column, value in chrom_stats.items():
        total_stats[column].append(value)

    print(f"{chrom} 的统计结果：")
    print(f"总长度: {total_length}, 可靠区域长度: {reliable_length}, rare match长度: {rare_match_length}")
    print(f"可靠区域占比: {reliable_pct:.2f}%, rare match占比: {rare_match_pct:.2f}%\n")

# Turn the collected columns into a DataFrame
df_stats = pd.DataFrame(total_stats)

# Destination of the summary table
output_csv_path = "./csv/RaMA_rare_match_reliablity.csv"

# Make sure the destination folder exists
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Write the summary without the index column
df_stats.to_csv(output_csv_path, index=False)

print(f"结果已保存至 {output_csv_path}")