import os
import pandas as pd
import re

# Chromosomes to process: chr1 through chr22, followed by chrX.
chromosomes = ["chr" + str(number) for number in range(1, 23)]
chromosomes.append("chrX")

# Root directory that is searched for <chromosome>/cigar.txt files.
base_path = "./result/UniAligner/"

# Column-oriented accumulator for the results of all chromosomes:
# every key is an output column and every value is its own list,
# which receives one entry per processed chromosome.
total_stats = {
    column: []
    for column in (
        "chromosome",
        "total length",
        "large indel",
        "large indel count",
        "large indel ratio (%)",
    )
}

# Helper: extract the total length and large-indel figures from a CIGAR string.
def extract_large_indel_from_cigar(cigar, size_threshold=5):
    """Summarise a CIGAR string as total length plus large-indel totals.

    Every ``<count><op>`` pair whose operation is one of ``MIDNSHP=X`` is
    parsed; any text that does not match that shape is ignored.

    Args:
        cigar: CIGAR string, e.g. ``"10M6I3D"``.
        size_threshold: An insertion (``I``) or deletion (``D``) counts as
            "large" only when its length is strictly greater than this
            value. Defaults to 5.

    Returns:
        A tuple ``(total_length, large_indel, large_indel_count)`` where
        ``total_length`` is the sum of the counts of all parsed operations,
        ``large_indel`` is the summed length of the large insertions and
        deletions, and ``large_indel_count`` is how many of them there are.
    """
    total_length = 0
    large_indel = 0
    large_indel_count = 0
    # finditer yields matches one at a time, so a very long CIGAR string
    # does not require building a list of every (count, op) pair first.
    for match in re.finditer(r'(\d+)([MIDNSHP=X])', cigar):
        num = int(match.group(1))
        op = match.group(2)
        # Every parsed operation contributes to the total length.
        total_length += num
        # Only insertions/deletions longer than the threshold are "large".
        if op in ("I", "D") and num > size_threshold:
            large_indel += num
            large_indel_count += 1
    return total_length, large_indel, large_indel_count

# Read each chromosome's cigar.txt file and accumulate its statistics.
for chrom in chromosomes:
    file_path = os.path.join(base_path, chrom, "cigar.txt")
    # A chromosome without a cigar.txt file is skipped and contributes
    # no row to the results.
    if not os.path.exists(file_path):
        continue

    # Per-chromosome running totals.
    total_length = 0
    large_indel = 0
    large_indel_count = 0

    # Iterate over the file handle instead of calling readlines(), so the
    # whole file is never held in memory as a list of lines.
    with open(file_path, 'r') as file:
        for line in file:
            length, indel, indel_count = extract_large_indel_from_cigar(line.strip())
            total_length += length
            large_indel += indel
            large_indel_count += indel_count

    # Percentage of the total length covered by large indels; 0 when no
    # operations were parsed, which avoids a division by zero.
    large_indel_ratio = large_indel * 100 / total_length if total_length > 0 else 0

    # Record this chromosome's row in the column-oriented accumulator.
    total_stats["chromosome"].append(chrom)
    total_stats["total length"].append(total_length)
    total_stats["large indel"].append(large_indel)
    total_stats["large indel count"].append(large_indel_count)
    total_stats["large indel ratio (%)"].append(large_indel_ratio)

# Destination of the CSV report.
output_csv_path = "./csv/UniAligner_large_indel_statistics.csv"

# Make sure the destination directory exists before anything is written.
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)

# Turn the accumulated columns into a table and save it without the
# DataFrame's index column.
df_stats = pd.DataFrame(total_stats)
df_stats.to_csv(output_csv_path, index=False)

