print('Importing libraries...')
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the tab-separated per-gene coverage table.
print('Loading data...' )
data = pd.read_csv('full_coverage_data_for_CLL_genes.tsv', sep='\t')

# Relabel an 'Average_Coverage' column as 'Median_Coverage'. rename() skips
# labels that are absent, so this does nothing when the column is missing.
# NOTE(review): the melt below also uses 'Median_Coverage' as its value
# column name -- confirm the input never actually carries this column.
data = data.rename(columns={'Average_Coverage': 'Median_Coverage'})

# Reshape wide -> long for seaborn: every column other than the two id
# columns becomes one (Sample, Median_Coverage) row per gene.
melted_df = pd.melt(
    data,
    id_vars=['Gene_Name', 'Total_Exonic_Length'],
    var_name='Sample',
    value_name='Median_Coverage',
)

# Keep only the text after the last "_" of each original column name.
sample_name_parts = melted_df['Sample'].str.split('_')
melted_df['Sample'] = sample_name_parts.str[-1]

# Label each row with its sequencer, keyed on the start of the sample name:
#   "20583"            -> "Sequel II (N=4)"
#   "20802" or "21183" -> "Revio (N=8)"
#   "16150"            -> "Short-read sequencing (N=8)"
#   anything else      -> "Unknown"
sequencer_by_prefix = {
    '20583': 'Sequel II (N=4)',
    '20802': 'Revio (N=8)',
    '21183': 'Revio (N=8)',
    '16150': 'Short-read sequencing (N=8)',
}
melted_df['Sequencer'] = 'Unknown'
for sample_prefix, sequencer_label in sequencer_by_prefix.items():
    prefix_matches = melted_df['Sample'].str.startswith(sample_prefix)
    melted_df.loc[prefix_matches, 'Sequencer'] = sequencer_label

# Mean of the coverage values per sequencer (not used by the plot below).
sequencer_avg = (
    melted_df.groupby('Sequencer')['Median_Coverage']
    .mean()
    .reset_index()
)


print('Plotting...')
# Fixed colour per sequencer label. seaborn raises ValueError when a dict
# palette lacks an entry for any value present in the hue column, so the
# 'Unknown' fallback label needs a colour as well; without it a single
# unrecognised sample name would abort the plot.
run_colors = {
    'Sequel II (N=4)': sns.color_palette()[0],
    'Revio (N=8)': sns.color_palette()[1],
    'Short-read sequencing (N=8)': "black",
    'Unknown': "gray",
}
label_size = 17
tick_size = 15

# Scatter plot: one point per (gene, sample), coloured by sequencer.
plt.figure(figsize=(8,6))
ax = sns.scatterplot(data=melted_df, x='Total_Exonic_Length', y='Median_Coverage', hue='Sequencer', palette=run_colors, s=50)
#plt.xscale('log')  # optional: log-scale the x axis
plt.xlabel('Transcript Length', fontsize=label_size)
plt.ylabel('Median Coverage', fontsize=label_size)
plt.tick_params(axis='y', labelsize=tick_size)
plt.tick_params(axis='x', labelsize=tick_size)

# seaborn orders legend entries by first appearance in the data; rebuild the
# legend in a fixed order instead. Labels absent from the plot are skipped,
# and 'Unknown' is listed last so unmatched samples stay visible.
handles, labels = ax.get_legend_handles_labels()
desired_order = ['Sequel II (N=4)', 'Revio (N=8)', 'Short-read sequencing (N=8)', 'Unknown']
ordered_labels = [label for label in desired_order if label in labels]
ordered_handles = [handles[labels.index(label)] for label in ordered_labels]

plt.legend(ordered_handles, ordered_labels, loc='upper right', fontsize=tick_size)


plt.savefig('coverage_scatter_plot.png')
#plt.show()  # optional: display interactively instead of only saving
print('Done!')