import pandas as pd
import os

# Summarize the per-dataset anchor files into a single CSV table.
#
# For every dataset directory under `base_path`, read its `final_anchor.csv`,
# total the `MatchLength` column, count the rows (one row per anchor), and
# write one summary row per dataset to `output_csv_path`.

# Root directory holding one sub-directory per dataset (relative to the cwd).
base_path = './result/mt/'

# Where the summary table is written.
output_csv_path = './csv/mt_match_length_anchor_counts.csv'

# Divisor used to turn a summed match length into the 'Coverage' column.
# NOTE(review): presumably the reference sequence length (1 Mbp) -- confirm.
REFERENCE_LENGTH = 1_000_000

# Dataset directories to scan: 70 and 75, then every value from 80 to 99.
dataset_dirs = ['dataset_70_1', 'dataset_75_1'] + [
    f'dataset_{i}_1' for i in range(80, 100)
]

# Per-dataset results, keyed by dataset directory name. All three dicts are
# filled together, so they always share the same keys in the same order.
match_length_sums = {}
anchor_counts = {}
coverage = {}

# Datasets whose anchor file was not found; reported after the loop so an
# incomplete summary is not mistaken for a complete one.
missing_datasets = []

for dataset in dataset_dirs:
    csv_path = os.path.join(base_path, dataset, 'final_anchor.csv')

    # Skip datasets that have no anchor file, but remember them.
    if not os.path.exists(csv_path):
        missing_datasets.append(dataset)
        continue

    df = pd.read_csv(csv_path)

    match_length_sum = df['MatchLength'].sum()
    anchor_count = len(df)  # one row per anchor

    match_length_sums[dataset] = match_length_sum
    anchor_counts[dataset] = anchor_count
    coverage[dataset] = match_length_sum / REFERENCE_LENGTH

if missing_datasets:
    print(
        f'Skipped {len(missing_datasets)} dataset(s) without final_anchor.csv: '
        + ', '.join(missing_datasets)
    )

# Assemble the summary table (one row per dataset that was found).
match_length_df = pd.DataFrame({
    'Dataset': list(match_length_sums),
    'Anchor Length Sum': list(match_length_sums.values()),
    'Anchor Count': list(anchor_counts.values()),
    'Coverage': list(coverage.values()),
})

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
output_dir = os.path.dirname(output_csv_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

match_length_df.to_csv(output_csv_path, index=False)