import re
import pandas as pd

# One CIGAR operation: a decimal run length followed by an operation code.
_CIGAR_OP_PATTERN = re.compile(r'(\d+)([MIDXS=])')


# Parse a CIGAR string and return, per region, the number of aligned bases of
# the original (first) sequence.
def count_bases_in_original_sequence(cigar_str, region_starts, region_ends):
    """Count the M/= aligned bases of sequence 1 that fall inside each region.

    Positions on sequence 1 are 1-based: the first base consumed by the CIGAR
    string is position 1. Region bounds are inclusive and are expected to be
    integer coordinates.

    Args:
        cigar_str: CIGAR string made of ``<length><op>`` pairs, where ``op`` is
            one of ``M``, ``I``, ``D``, ``X``, ``S`` or ``=``. Anything that
            does not match this shape is ignored.
        region_starts: Inclusive start position of each region on sequence 1.
        region_ends: Inclusive end position of each region on sequence 1,
            paired with ``region_starts`` by index.

    Returns:
        A list with one integer per entry of ``region_starts``: the number of
        sequence-1 positions inside that region covered by ``M`` or ``=``
        operations. Overlapping regions are counted independently.
    """
    region_counts = [0] * len(region_starts)
    regions = list(zip(region_starts, region_ends))
    pos1 = 0  # last consumed position on sequence 1 (0 = nothing consumed yet)

    for length_text, op in _CIGAR_OP_PATTERN.findall(cigar_str):
        length = int(length_text)

        if op in ('M', '='):
            # The run covers positions run_start..run_end (inclusive), so the
            # per-region contribution is the size of the interval overlap;
            # this avoids walking the alignment one base at a time.
            run_start = pos1 + 1
            run_end = pos1 + length
            for i, (start, end) in enumerate(regions):
                overlap = min(end, run_end) - max(start, run_start) + 1
                if overlap > 0:
                    region_counts[i] += int(overlap)
            pos1 = run_end
        elif op in ('D', 'X'):
            # Both operations consume bases of sequence 1 without adding to
            # the counts. 'X' (mismatch) must advance the position just like
            # 'D'; otherwise every later base is attributed to the wrong
            # coordinate.
            pos1 += length
        # 'I' and 'S' do not consume sequence 1, so the position is unchanged.

    return region_counts

# Load a CIGAR string from a text file.
def parse_cigar_file(cigar_path):
    """Return the full text of ``cigar_path`` with surrounding whitespace stripped."""
    with open(cigar_path, 'r') as handle:
        return handle.read().strip()

# Load the spliced-region table from position.csv.
csv_path = './csv/position.csv'
df = pd.read_csv(csv_path)

# The table is read through its Start_Seq1 and Length_Seq1 columns
# (presumably each region's start and length on sequence 1 -- confirm against
# the file that produces position.csv). Ends are derived as inclusive
# coordinates: start + length - 1.
region_starts = df['Start_Seq1'].tolist()
region_ends = [start + length - 1 for start, length in zip(df['Start_Seq1'], df['Length_Seq1'])]

# Read the CIGAR strings produced by RaMA and UniAligner.
cigar_path_rama = './result/RaMA/cigar.txt'
cigar_str_rama = parse_cigar_file(cigar_path_rama)

cigar_path_unialigner = './result/UniAligner/cigar.txt'
cigar_str_unialigner = parse_cigar_file(cigar_path_unialigner)

# Count, per region, the bases of the original sequence covered by each alignment.
rama_region_counts = count_bases_in_original_sequence(cigar_str_rama, region_starts, region_ends)
unialigner_region_counts = count_bases_in_original_sequence(cigar_str_unialigner, region_starts, region_ends)

# Length of each region (inclusive bounds).
region_lengths = [end - start + 1 for start, end in zip(region_starts, region_ends)]

# Coverage = counted bases / region length. A region whose length is not
# positive has nothing to cover, so it is reported as 0.0 instead of dividing
# by zero.
rama_coverages = [
    count / length if length > 0 else 0.0
    for count, length in zip(rama_region_counts, region_lengths)
]
unialigner_coverages = [
    count / length if length > 0 else 0.0
    for count, length in zip(unialigner_region_counts, region_lengths)
]

# Mean coverage over all regions; an empty region table yields 0.0 rather than
# raising ZeroDivisionError.
average_rama_coverage = sum(rama_coverages) / len(rama_coverages) if rama_coverages else 0.0
average_unialigner_coverage = (
    sum(unialigner_coverages) / len(unialigner_coverages) if unialigner_coverages else 0.0
)

# Print the per-region and average coverage for both aligners.
print("RaMA每个区域的覆盖率：")
for i, coverage in enumerate(rama_coverages):
    print(f"区域{i+1} ({region_starts[i]}-{region_ends[i]}): {coverage:.4f}")

print("UniAligner每个区域的覆盖率：")
for i, coverage in enumerate(unialigner_coverages):
    print(f"区域{i+1} ({region_starts[i]}-{region_ends[i]}): {coverage:.4f}")

print(f"RaMA的平均覆盖率: {average_rama_coverage:.4f}")
print(f"UniAligner的平均覆盖率: {average_unialigner_coverage:.4f}")

# Save the per-region results to a CSV file.
df_result = pd.DataFrame({
    'Region_Start': region_starts,
    'Region_End': region_ends,
    'Region_Length': region_lengths,
    'RaMA_Matched_Bases': rama_region_counts,
    'RaMA_Coverage': rama_coverages,
    'UniAligner_Matched_Bases': unialigner_region_counts,
    'UniAligner_Coverage': unialigner_coverages
})
df_result.to_csv('./csv/coverage_by_region.csv', index=False)

print("结果已保存到CSV文件。")
