import csv
from Bio import SeqIO

# Input data files: the unaligned sequence pair and its reference alignment.
# NOTE(review): the variable names say "99" while both paths point at
# "dataset_95" files — confirm which one is intended.
dataset_99_1_path = "./data/dataset_95_1.fasta"
dataset_99_TRUE_1_path = "./data/dataset_95_TRUE_1.fasta"

# 读取序列
def read_fasta(file_path):
    """Return the sequences of the first two records of a FASTA file.

    Every record in the file is parsed into memory; only the first two are
    used and any further records are ignored.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A 2-tuple of plain strings: (first sequence, second sequence).

    Raises:
        IndexError: If the file holds fewer than two records.
    """
    with open(file_path, "r") as handle:
        records = list(SeqIO.parse(handle, "fasta"))
    first_record = records[0]
    second_record = records[1]
    return str(first_record.seq), str(second_record.seq)

# Load the unaligned sequence pair, then the two rows of its alignment.
seq1, seq2 = read_fasta(dataset_99_1_path)
aligned_seq1, aligned_seq2 = read_fasta(dataset_99_TRUE_1_path)

# Load the four centromere sequences as plain strings, in the same order as
# the target names: CHM13 chr16, CHM13 chr20, CHM1 chr16, CHM1 chr20.
(
    centromere1_seq,
    centromere2_seq,
    centromere1_seq2,
    centromere2_seq2,
) = (
    str(SeqIO.read(cen_path, "fasta").seq)
    for cen_path in (
        "./data/CHM13_chr16_cen.fasta",
        "./data/CHM13_chr20_cen.fasta",
        "./data/CHM1_chr16_cen.fasta",
        "./data/CHM1_chr20_cen.fasta",
    )
)

# 解析比对结果，生成对应关系
def map_alignment(aligned_seq1, aligned_seq2):
    """Map ungapped positions of the first alignment row to the second.

    Walks both rows column by column, counting the non-gap characters seen so
    far in each row ('-' is the gap symbol). For every column in which both
    rows hold a base, the 1-based ungapped position in the first row is mapped
    to the 1-based ungapped position in the second row. Positions of the
    first row that face a gap get no entry.

    Args:
        aligned_seq1: First row of the pairwise alignment, gaps included.
        aligned_seq2: Second row of the pairwise alignment, gaps included;
            must be as long as ``aligned_seq1``.

    Returns:
        dict mapping 1-based seq1 positions to 1-based seq2 positions.

    Raises:
        ValueError: If the two rows differ in length. ``zip`` would otherwise
            silently drop the surplus columns and return a partial map.
    """
    if len(aligned_seq1) != len(aligned_seq2):
        raise ValueError(
            "aligned sequences must have the same length, got "
            f"{len(aligned_seq1)} and {len(aligned_seq2)}"
        )

    mapping = {}
    pos1 = 0  # non-gap characters consumed from the first row
    pos2 = 0  # non-gap characters consumed from the second row
    for base1, base2 in zip(aligned_seq1, aligned_seq2):
        has_base1 = base1 != '-'
        has_base2 = base2 != '-'
        if has_base1:
            pos1 += 1
        if has_base2:
            pos2 += 1
        # Only columns where both rows carry a base define a correspondence.
        if has_base1 and has_base2:
            mapping[pos1] = pos2
    return mapping

# Build the seq1 -> seq2 position map from the two alignment rows.
mapping = map_alignment(aligned_seq1, aligned_seq2)

# Insertion points: the number of seq1 bases that precede each centromere.
insert_pos1 = 300000
insert_pos2 = 800000

# The slicing and length arithmetic below would silently produce inconsistent
# output if the insertion points were out of order or past the end of seq1,
# so reject such values up front.
if not insert_pos1 <= insert_pos2 <= len(seq1):
    raise ValueError(
        "insert positions must satisfy insert_pos1 <= insert_pos2 <= len(seq1); "
        f"got insert_pos1={insert_pos1}, insert_pos2={insert_pos2}, "
        f"len(seq1)={len(seq1)}"
    )

# Resolve each insertion point in seq2 exactly once. The map only has entries
# for seq1 positions aligned to a seq2 base, so a position facing a gap raises
# KeyError; re-raise it with an explanation instead of a bare number.
try:
    mapped_pos1 = mapping[insert_pos1]
    mapped_pos2 = mapping[insert_pos2]
except KeyError as err:
    raise KeyError(
        f"insert position {err.args[0]} is not aligned to a base of seq2 "
        "(gap column or outside the alignment); choose an aligned position"
    ) from err

# Lengths of the first centromere as inserted into seq1 and into seq2.
centromere1_length_seq1 = len(centromere1_seq)
centromere1_length_seq2 = len(centromere1_seq2)

# Lengths of the second centromere as inserted into seq1 and into seq2.
centromere2_length_seq1 = len(centromere2_seq)
centromere2_length_seq2 = len(centromere2_seq2)

# Segment 1: from the start of each sequence up to the first insertion point.
segment1_start_seq1 = 0
segment1_length_seq1 = insert_pos1

segment1_start_seq2 = 0
segment1_length_seq2 = mapped_pos1

# Segment 2: the first centromere.
segment2_start_seq1 = insert_pos1
segment2_length_seq1 = centromere1_length_seq1

segment2_start_seq2 = mapped_pos1
segment2_length_seq2 = centromere1_length_seq2

# Segment 3: between the two insertion points. Start coordinates from here on
# are shifted by the length of the first centromere.
segment3_start_seq1 = insert_pos1 + centromere1_length_seq1
segment3_length_seq1 = insert_pos2 - insert_pos1

segment3_start_seq2 = mapped_pos1 + centromere1_length_seq2
segment3_length_seq2 = mapped_pos2 - mapped_pos1

# Segment 4: the second centromere, shifted by the first centromere.
segment4_start_seq1 = insert_pos2 + centromere1_length_seq1
segment4_length_seq1 = centromere2_length_seq1

segment4_start_seq2 = mapped_pos2 + centromere1_length_seq2
segment4_length_seq2 = centromere2_length_seq2

# Segment 5: from the end of the second centromere to the end of the
# sequence, shifted by both centromeres.
segment5_start_seq1 = insert_pos2 + centromere1_length_seq1 + centromere2_length_seq1
segment5_length_seq1 = len(seq1) - insert_pos2

segment5_start_seq2 = mapped_pos2 + centromere1_length_seq2 + centromere2_length_seq2
segment5_length_seq2 = len(seq2) - mapped_pos2

# Assemble both sequences with the centromeres spliced in at the insertion
# points.
seq1_with_centromeres = "".join((
    seq1[:insert_pos1],
    centromere1_seq,
    seq1[insert_pos1:insert_pos2],
    centromere2_seq,
    seq1[insert_pos2:],
))
seq2_with_centromeres = "".join((
    seq2[:mapped_pos1],
    centromere1_seq2,
    seq2[mapped_pos1:mapped_pos2],
    centromere2_seq2,
    seq2[mapped_pos2:],
))

# Write each assembled sequence to its own single-record FASTA file.
with open("./data/mixture_seq1_2.fasta", "w") as f1, open("./data/mixture_seq2_2.fasta", "w") as f2:
    f1.write(f">mixture_seq1\n{seq1_with_centromeres}\n")
    f2.write(f">mixture_seq2\n{seq2_with_centromeres}\n")

# One row per segment, in the column order of the CSV header.
segment_rows = [
    ('Segment 1', segment1_start_seq1, segment1_length_seq1, segment1_start_seq2, segment1_length_seq2),
    ('Centromere 1', segment2_start_seq1, segment2_length_seq1, segment2_start_seq2, segment2_length_seq2),
    ('Segment 2', segment3_start_seq1, segment3_length_seq1, segment3_start_seq2, segment3_length_seq2),
    ('Centromere 2', segment4_start_seq1, segment4_length_seq1, segment4_start_seq2, segment4_length_seq2),
    ('Segment 3', segment5_start_seq1, segment5_length_seq1, segment5_start_seq2, segment5_length_seq2),
]

# Write the start and length of all five segments to the CSV file.
with open("./csv/position2.csv", "w", newline="") as csvfile:
    fieldnames = ['Segment', 'Start_Seq1', 'Length_Seq1', 'Start_Seq2', 'Length_Seq2']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(dict(zip(fieldnames, row)) for row in segment_rows)

print("序列修改完成，五段位置和长度已保存到 CSV 文件。")
