#!/usr/bin/env python3
# a custom python script for further analysis of SVs (comparison of SVs)
 
# Import modules
import pandas as pd
from itertools import chain
from pyfasta import Fasta

# Load Assemblytics bed files
sv1 = "1_0.Assemblytics_structural_variants.bed"
sv2 = "2_0.Assemblytics_structural_variants.bed"
sv1_data = pd.read_csv(sv1, sep='\t')
sv2_data = pd.read_csv(sv2, sep='\t')

# One dataframe per chromosome, selected on the 'reference' column.
# The first table is filtered on 'chrI'...'chrX' labels, the second on
# bare 'I'...'X' labels.
sv1_I = sv1_data.loc[sv1_data['reference'].eq('chrI')]
sv1_II = sv1_data.loc[sv1_data['reference'].eq('chrII')]
sv1_III = sv1_data.loc[sv1_data['reference'].eq('chrIII')]
sv1_IV = sv1_data.loc[sv1_data['reference'].eq('chrIV')]
sv1_V = sv1_data.loc[sv1_data['reference'].eq('chrV')]
sv1_X = sv1_data.loc[sv1_data['reference'].eq('chrX')]
sv2_I = sv2_data.loc[sv2_data['reference'].eq('I')]
sv2_II = sv2_data.loc[sv2_data['reference'].eq('II')]
sv2_III = sv2_data.loc[sv2_data['reference'].eq('III')]
sv2_IV = sv2_data.loc[sv2_data['reference'].eq('IV')]
sv2_V = sv2_data.loc[sv2_data['reference'].eq('V')]
sv2_X = sv2_data.loc[sv2_data['reference'].eq('X')]

# Combine all SV regions with 'offset' (default: start-1000~end+1000)
def conc_ranges_from_chr(chrom, offset=1000):
    """Collect every position covered by the padded SV intervals of a table.

    Each row contributes ``range(max(start - offset, 0), stop + offset)``,
    where ``start`` and ``stop`` are the values in the second and third
    columns of the row. The lower bound is clamped at 0; the upper bound
    ``stop + offset`` itself is excluded because ``range`` is half-open.

    Args:
        chrom: DataFrame whose columns at positions 1 and 2 hold the integer
            start and stop coordinates of each SV.
        offset: Number of positions added on each side of every interval.

    Returns:
        A set of integer positions covered by at least one padded interval
        (empty when ``chrom`` has no rows).
    """
    covered = set()
    # Address the coordinate columns by position with .iloc: an integer key
    # on a string-labelled Series (row[1]) is deprecated positional fallback.
    starts = chrom.iloc[:, 1]
    stops = chrom.iloc[:, 2]
    for start, stop in zip(starts, stops):
        # Fill the set directly instead of nesting one chain() per row.
        covered.update(range(max(start - offset, 0), stop + offset))
    return covered
	
# Get all SV regions with 500bp offset
# Padded position sets for chromosomes I-V of the first table.
(sv1_I_conc_500,
 sv1_II_conc_500,
 sv1_III_conc_500,
 sv1_IV_conc_500,
 sv1_V_conc_500) = [
    conc_ranges_from_chr(chrom_df, offset=500)
    for chrom_df in (sv1_I, sv1_II, sv1_III, sv1_IV, sv1_V)
]

# Padded position sets for chromosomes I-V of the second table.
(sv2_I_conc_500,
 sv2_II_conc_500,
 sv2_III_conc_500,
 sv2_IV_conc_500,
 sv2_V_conc_500) = [
    conc_ranges_from_chr(chrom_df, offset=500)
    for chrom_df in (sv2_I, sv2_II, sv2_III, sv2_IV, sv2_V)
]

# Check, for each SV of one genome, whether it is fully covered by the padded SV regions of the other genome (i.e. whether those regions form a superset of the SV)
def check_subset(chrom, ref_chrom, if_print=False, including_print=False):
    """Flag each row of ``chrom`` whose interval lies entirely in ``ref_chrom``.

    For every row, the positions ``start .. stop`` (both ends included) are
    taken from the second and third columns and tested for membership in
    ``ref_chrom``.

    Args:
        chrom: DataFrame whose columns at positions 1 and 2 hold the integer
            start and stop coordinates of each SV.
        ref_chrom: Container of positions (e.g. the set returned by
            ``conc_ranges_from_chr``) to test against.
        if_print: When true, print every row that is NOT fully contained.
        including_print: When true, print every row that IS fully contained.

    Returns:
        A list with one entry per row, in row order: 1 if every position of
        the row is in ``ref_chrom``, otherwise 0.
    """
    result_lst = []
    for each_row in chrom.iterrows():
        row = each_row[1]
        # Positional access via .iloc: an integer key on a string-labelled
        # Series (row[1]) is deprecated positional fallback in pandas.
        start, stop = row.iloc[1], row.iloc[2]
        # Membership test per position; stops at the first missing position
        # and avoids building a throwaway set for every row.
        covered = all(pos in ref_chrom for pos in range(start, stop + 1))
        if covered:
            result_lst.append(1)
            if including_print:
                print(each_row)
        else:
            result_lst.append(0)
            if if_print:
                print(each_row)
    return result_lst

# Check the detail for further analysis
def check_subset_detail(chrom, ref_chrom, if_print=False, including_print=False):
    """Return the rows of ``chrom`` whose interval is not fully in ``ref_chrom``.

    For every row, the positions ``start .. stop`` (both ends included) are
    taken from the second and third columns and tested for membership in
    ``ref_chrom``. Rows with at least one missing position are collected.

    Args:
        chrom: DataFrame whose columns at positions 1 and 2 hold the integer
            start and stop coordinates of each SV.
        ref_chrom: Container of positions to test against.
        if_print: When true, print each collected (non-contained) row, as
            the same flag does in ``check_subset``.
        including_print: Also prints each collected row when true; kept so
            existing callers that used this flag see the same output.

    Returns:
        A list of ``(index, row)`` tuples as produced by
        ``DataFrame.iterrows()``, one per non-contained row, in row order.
    """
    result_lst = []
    for each_row in chrom.iterrows():
        row = each_row[1]
        # Positional access via .iloc: an integer key on a string-labelled
        # Series (row[1]) is deprecated positional fallback in pandas.
        start, stop = row.iloc[1], row.iloc[2]
        if all(pos in ref_chrom for pos in range(start, stop + 1)):
            continue
        result_lst.append(each_row)
        # Previously only including_print had any effect; if_print was
        # accepted but ignored.
        if if_print or including_print:
            print(each_row)
    return result_lst

# Getting the results in 1:1 manner	
# Padded position sets for chrX, built with the same 500 bp offset as the
# `*_conc_500` sets used for chromosomes I-V, so the chrX comparisons below
# have the sets they need.
sv1_X_conc_500 = conc_ranges_from_chr(sv1_X, offset=500)
sv2_X_conc_500 = conc_ranges_from_chr(sv2_X, offset=500)

# One 1/0 flag per SV: is the SV fully inside the padded SV regions of the
# same chromosome in the other genome? The `*_conc_500` sets are used because
# the un-suffixed `*_conc` names were never defined (NameError).
check_sv1_I_sv2_I = check_subset(sv1_I, sv2_I_conc_500, if_print=False)
check_sv1_II_sv2_II = check_subset(sv1_II, sv2_II_conc_500, if_print=False)
check_sv1_III_sv2_III = check_subset(sv1_III, sv2_III_conc_500, if_print=False)
check_sv1_IV_sv2_IV = check_subset(sv1_IV, sv2_IV_conc_500, if_print=False)
check_sv1_V_sv2_V = check_subset(sv1_V, sv2_V_conc_500, if_print=False)
check_sv1_X_sv2_X = check_subset(sv1_X, sv2_X_conc_500, if_print=False)
check_sv2_I_sv1_I = check_subset(sv2_I, sv1_I_conc_500, if_print=False)
check_sv2_II_sv1_II = check_subset(sv2_II, sv1_II_conc_500, if_print=False)
check_sv2_III_sv1_III = check_subset(sv2_III, sv1_III_conc_500, if_print=False)
check_sv2_IV_sv1_IV = check_subset(sv2_IV, sv1_IV_conc_500, if_print=False)
check_sv2_V_sv1_V = check_subset(sv2_V, sv1_V_conc_500, if_print=False)
check_sv2_X_sv1_X = check_subset(sv2_X, sv1_X_conc_500, if_print=False)

# Getting the detail (the rows that are not fully covered) in the same manner
# Save each result manually into txt files
check_sv1_I_sv2_Id = check_subset_detail(sv1_I, sv2_I_conc_500, if_print=False)
check_sv1_II_sv2_IId = check_subset_detail(sv1_II, sv2_II_conc_500, if_print=False)
check_sv1_III_sv2_IIId = check_subset_detail(sv1_III, sv2_III_conc_500, if_print=False)
check_sv1_IV_sv2_IVd = check_subset_detail(sv1_IV, sv2_IV_conc_500, if_print=False)
check_sv1_V_sv2_Vd = check_subset_detail(sv1_V, sv2_V_conc_500, if_print=False)
check_sv1_X_sv2_Xd = check_subset_detail(sv1_X, sv2_X_conc_500, if_print=False)
check_sv2_I_sv1_Id = check_subset_detail(sv2_I, sv1_I_conc_500, if_print=False)
check_sv2_II_sv1_IId = check_subset_detail(sv2_II, sv1_II_conc_500, if_print=False)
check_sv2_III_sv1_IIId = check_subset_detail(sv2_III, sv1_III_conc_500, if_print=False)
check_sv2_IV_sv1_IVd = check_subset_detail(sv2_IV, sv1_IV_conc_500, if_print=False)
check_sv2_V_sv1_Vd = check_subset_detail(sv2_V, sv1_V_conc_500, if_print=False)
check_sv2_X_sv1_Xd = check_subset_detail(sv2_X, sv1_X_conc_500, if_print=False)