Notebook to perform Branch Length Scoring (BLS)

1. Cluster-Buster with the icisTarget motif collection on conserved sequences per species

  • Input: per species, merged fasta sequences of the conserved MEL enhancers were used.
  • The icisTarget motif collection v8 was used, containing 20,003 motifs.
  • Cluster-Buster will score each of these fastas with the 20,003 motifs.
  • Per sequence, only the top CRM score is kept.
In [ ]:
module load Cluster-Buster/20180705-GCCcore-6.4.0

# Cluster-Buster output directory.
cbust_output_dir="/outdir";

# Create the output directory up front: the output redirection at the end of
# each Cluster-Buster job fails when the directory does not exist yet.
mkdir -p "${cbust_output_dir}";

# Run Cluster-Buster for each species.
for species in hg19 canFam3 mm10 susScr3 equCab2 danRer7; do
    # Specify FASTA file to score with the current species.
    species_regions_fasta_filename="Shared_Merged.${species}.fa";

    # Score all regions for one motif at the time:
    #   - "ls" feeds one Cluster-Buster motif file (*.cb) per line to GNU parallel.
    #   - The variable assignments in front of "parallel" put those variables in
    #     the environment of parallel, so the quoted job script can use them.
    #   - At most 20 jobs run at the same time (-j 20).
    ls /staging/leuven/stg_00002/lcb/icistarget/data/motifCollection/v8/singletons_md5/*.cb \
        | cbust_output_dir="${cbust_output_dir}" \
            species="${species}" \
            species_regions_fasta_filename="${species_regions_fasta_filename}" \
            parallel \
                --env cbust_output_dir \
                --env species \
                --env species_regions_fasta_filename \
                -j 20 \
                '
                # GNU parallel fills in the full path of the current motif file
                # and its basename without the last extension.
                cb_filename={};
                cb_basename={/.};
                cb_basename="${cb_basename%.cb}";

                # One output file per motif and per species.
                cbust_output_filename="${cbust_output_dir}/${cb_basename}.${species}.tsv";

                # Score current species FASTA file for current motif:
                #   - Minimum CRM score: 0.0
                #   - Minimum motif score: 3 (different than before)
                #   - Use whole sequence of region (10000 bp up and down)
                #     to calculate background nucleotide frequencies.
                #   - Keep only top CRM score per region.
                # -r: Range in bp for counting local nucleotide abundances (default = 100)
                # -t: Keep top X clusters per sequence (0 (= all))
                # -f: Output format: sorted by cluster score: seq name, score, seq number, rank
                #
                # Only the first two columns (sequence name and CRM score) are kept.
                cbust \
                    -c 0.0 \
                    -m 3 \
                    -r 10000 \
                    -t 1 \
                    -f 4 \
                    "${cb_filename}" \
                    "${species_regions_fasta_filename}" \
                    | cut -f 1,2 \
                    > "${cbust_output_filename}";
                '

done

2. BLS scoring

  • Output from previous step: a file per motif per species containing the top CRM score for the motif in each of the conserved sequences.
  • Combine CRM scores per region over multiple species for each motif into one output file per motif.
  • Calculate the BLS per motif: outputs a tsv file with the summed BLS per motif.
In [ ]:
# 1. Combine CRM scores per region over multiple species for each motif into one output file per motif.

module load Python/3.6.4-foss-2018a

# The script itself is listed under "Extra Python scripts" further down.
#
# Note: a comment can not be placed after a line-continuation backslash
# ("\ # comment" ends the command on that line), so all comments are kept
# above the command.
#
#   --md5-motifs: TSV file mapping motif MD5 names to motif IDs.
#   --regions:    File with all region IDs.
#   --species:    Species (assemblies) to combine.
#   --input:      Per motif and per species Cluster-Buster output template.
#   --output:     Per motif combined CRM scores output template.
#   --jobs:       Number of processes.
combine_CRM_scores_per_region_over_multiple_species.py \
    --md5-motifs /staging/leuven/stg_00002/lcb/icistarget/data/motifCollection/v8/motifs_count_md5_to_motif_names.tsv \
    --regions Shared_Merged.region_ids \
    --species hg19 canFam3 mm10 susScr3 equCab2 danRer7 \
    --input cbust/@motif@.@species@.tsv \
    --output cbust/@motif@.crm_scores.tsv \
    --jobs 10
In [ ]:
# 2. Calculate BLS per motif
#    with CRM min = 3

module load Python/3.6.4-foss-2018a

# Activate the virtualenv used for running the BLS script (the script imports Bio.Phylo).
source /user/leuven/317/vsc31703/stg_00002/system/software/virtualenv/BioPython/bin/activate

# Use the full "--output" option name instead of the "--o" abbreviation:
# the abbreviation only works as long as argparse can resolve it to exactly
# one option of the script.
calculate_branch_length_score_per_motif.py \
    --motifs /user/leuven/317/vsc31703/stg_00002/Melanoma-species/motif_ids.tsv \
    --phylo hg19.100way.phyloP100way.newick \
    --species hg19 \
    --crm 3.0 \
    --input cbust/@motif@.crm_scores.tsv \
    --output cbust/bls.motifs.tsv \
    --jobs 10

deactivate
In [ ]:
# 3. Make HTML
#
# Convert the TSV file with a BLS per motif to an HTML file.
#
# NOTE(review): the option parsing of motif_ids_tsv_to_html.py is not part of
# the script excerpt in this notebook. Presumably "-m 8" selects motif
# collection v8, "-n hgnc" the gene nomenclature used for motif to TF
# annotation and "-t" the title of the HTML page -- TODO confirm, together
# with the meaning of "-d 1" and "-r".
# NOTE(review): the previous step writes "cbust/bls.motifs.tsv" while this
# command reads "bls.motifs.tsv" -- confirm the working directory it is run from.

/motif_ids_tsv_to_html.py \
-i bls.motifs.tsv \
-o bls.motifs.html \
-m 8 -n hgnc -d 1 -r -t 'Cross-species Melanoma'

Extra Python scripts:

combine_CRM_scores_per_region_over_multiple_species.py

In [ ]:
#!/usr/bin/env python3

import argparse
import sys
import os.path
import multiprocessing as mp
import numpy as np
import pandas as pd


def get_region_or_motif_ids(region_or_motif_ids_filename):
    """
    Read file with all region IDs or motif IDs in a list.

      - region_or_motif_ids_filename:
          TAB-separated file without header line. Only the first column
          (the region ID or motif ID) is read. Everything from a "#"
          character till the end of the line is treated as a comment.

    Returns a list with all IDs in the same order as in the file.
    """

    with open(region_or_motif_ids_filename, 'r') as region_or_motif_ids_fh:
        region_or_motif_ids_df = pd.read_csv(
            filepath_or_buffer=region_or_motif_ids_fh,
            sep='\t',
            header=None,
            names=['region_or_motif_id'],
            index_col=None,
            usecols=[0],
            comment='#',
            engine='c',
        )

    # The previous trailing "return None" could never be reached and was removed.
    return region_or_motif_ids_df['region_or_motif_id'].tolist()


def get_motif_md5_to_motif_ids(motif_md5_to_motif_ids_filename):
    """
    Read file with all motif MD5 names to motif IDs in a dataframe.

      - motif_md5_to_motif_ids_filename:
          TAB-separated file without header line, with the motif MD5 name
          in the first column and the motif ID in the second column (other
          columns are ignored). Everything from a "#" character till the
          end of the line is treated as a comment.

    Returns a dataframe:
      - with motif MD5 names ('motif_md5') as row names.
      - with one column 'motif_id' which contains the motif IDs.
    """

    with open(motif_md5_to_motif_ids_filename, 'r') as motif_md5_to_motif_ids_fh:
        motif_md5_to_motif_ids_df = pd.read_csv(
            filepath_or_buffer=motif_md5_to_motif_ids_fh,
            sep='\t',
            header=None,
            names=['motif_md5', 'motif_id'],
            index_col=0,
            usecols=[0, 1],
            comment='#',
            engine='c',
        )

    # The previous trailing "return None" could never be reached and was removed.
    return motif_md5_to_motif_ids_df


def create_zerod_dataframe_for_cbust_of_species(region_ids, species):
    """
    Return a dataframe:
      - with all species names as columns.
      - with all region IDs as rows.
      - initialised to 0 (CRM score).
    so it can be filled in later with the actual values.

      - region_ids:
          List of region IDs (used as row index, named 'region_ids').
      - species:
          List of species names (used as column names).
    """

    nbr_regions = len(region_ids)
    nbr_species = len(species)

    region_ids_to_crm_score_per_species_df = pd.DataFrame(
        np.zeros(
            (nbr_regions, nbr_species),
            # "np.float" was only an alias for the builtin float (a 64-bit
            # float) and has been removed from recent NumPy releases, so
            # request the equivalent dtype explicitly.
            dtype=np.float64
        ),
        index=pd.Index(data=region_ids, name='region_ids'),
        columns=species,
    )

    return region_ids_to_crm_score_per_species_df


def read_cbust_output_format4(cbust_output_format4_filename):
    """
    Read Cluster-Buster output format 4 (-f 4) in a dataframe:
      - with region IDs ('region_id') as row names (column 1 of input file)
      - with CRM scores ('crm_score') as column names (column 2 of input file)
    and return it.

      - cbust_output_format4_filename:
          TAB-separated Cluster-Buster output file without header line.
          Only the first two columns are read.

    If the file does not exist, an error message is written to standard
    error and the process exits with exit code 1.
    """
    if not os.path.exists(cbust_output_format4_filename):
        print(
            'Error: Cluster-Buster output format 4 file "{0:s}" could not be found.'.format(
                    cbust_output_format4_filename
            ),
            file=sys.stderr,
        )
        sys.exit(1)

    with open(cbust_output_format4_filename, 'r') as cbust_output_format4_fh:
        region_ids_to_crm_score_df = pd.read_csv(
            filepath_or_buffer=cbust_output_format4_fh,
            sep='\t',
            header=None,
            names=['region_id', 'crm_score'],
            index_col=[0],
            usecols=[0, 1],
            comment='#',
            engine='c',
        )

    # The previous trailing "return None" could never be reached and was removed.
    return region_ids_to_crm_score_df


def combine_crm_scores_per_region_over_multiple_species(
        region_ids,
        species,
        cbust_output_format4_filename_template,
        combined_crm_scores_filename_template,
        motif_md5_or_motif_id,
        motif_id):
    """
    Write one TSV file for a motif with the CRM score of each region (rows)
    in each species (columns).

      - region_ids:
          List of all region IDs written to the output file.
      - species:
          List of species names for which the CRM scores will
          be written to the output file.
      - cbust_output_format4_filename_template:
          Cluster-Buster output format 4 filename template.
          The last "@motif@" is replaced by the value specified in
          "motif_md5_or_motif_id" and the last "@species@" is replaced by
          each species name specified in "species".
          Example: "/some/path/@motif@.@species@.tsv"
      - combined_crm_scores_filename_template:
          Combined CRM scores output filename template.
          The last "@motif@" is replaced by the value specified in "motif_id".
          Example "/some/path/@motif@.crm_scores.tsv"
      - motif_md5_or_motif_id:
          motif MD5 name (or motif ID) used in the input filenames.
      - motif_id:
          motif ID used in the output filename.

    When a template lacks a required placeholder, an error message is written
    to standard error and the process exits with exit code 1.
    """

    def fill_in_last_placeholder(filename, placeholder, value, error_message):
        # Replace the last occurrence of the placeholder in the filename with
        # the value, or print the error message and exit if there is none.
        placeholder_offset = filename.rfind(placeholder)

        if placeholder_offset < 0:
            print(error_message, file=sys.stderr)
            sys.exit(1)

        return filename[:placeholder_offset] + value + filename[placeholder_offset + len(placeholder):]

    print(motif_md5_or_motif_id)

    # Start from a region IDs x species dataframe with all CRM scores set to 0.
    crm_scores_df = create_zerod_dataframe_for_cbust_of_species(
        region_ids=region_ids,
        species=species
    )

    for assembly in species:
        # Build the Cluster-Buster output filename for this motif and species.
        cbust_filename = fill_in_last_placeholder(
            cbust_output_format4_filename_template,
            '@motif@',
            motif_md5_or_motif_id,
            'Error: Cluster-Buster output format 4 filename template "{0:s}" does not contain "@motif@".'.format(
                cbust_output_format4_filename_template
            ),
        )
        cbust_filename = fill_in_last_placeholder(
            cbust_filename,
            '@species@',
            assembly,
            'Error: Cluster-Buster output format 4 filename template "{0:s}" does not contain "@species@".'.format(
                cbust_output_format4_filename_template
            ),
        )

        crm_scores_for_assembly_df = read_cbust_output_format4(
            cbust_output_format4_filename=cbust_filename
        )

        # Overwrite the zeros of the regions which got a CRM score in this species.
        crm_scores_df.loc[crm_scores_for_assembly_df.index, assembly] = crm_scores_for_assembly_df.loc[:, 'crm_score']

    # Build the output filename for this motif.
    output_filename = fill_in_last_placeholder(
        combined_crm_scores_filename_template,
        '@motif@',
        motif_id,
        'Error: Combined CRM scores filename template "{0:s}" does not contain "@motif@".'.format(
            combined_crm_scores_filename_template
        ),
    )

    print(output_filename)

    # Regions without a CRM score in a species keep their initial 0.0 value.
    with open(output_filename, 'w') as output_fh:
        crm_scores_df.to_csv(
            path_or_buf=output_fh,
            sep='\t',
            float_format='%.10g',
            header=True,
            index=True,
        )


def main():
    """
    Command line entry point: combine the per species Cluster-Buster CRM
    scores to one output file per motif, processing the motifs in parallel.

    Exits with exit code 0 when all motifs were processed. An exception
    raised while processing a motif is re-raised here, so a failed motif
    no longer goes unnoticed.
    """
    parser = argparse.ArgumentParser(
        description='Combine CRM scores per region over multiple species for each motif to one output file per motif.'
    )

    # Motifs are specified either as a list of motif IDs or as a mapping
    # from motif MD5 names to motif IDs (exactly one of both is required).
    motifs_mutually_exclusive_group = parser.add_mutually_exclusive_group(required=True)
    motifs_mutually_exclusive_group.add_argument(
        '-m',
        '--motifs',
        dest='motif_ids_filename',
        action='store',
        type=str,
        required=False,
        help="File with list of motif IDs."
    )
    motifs_mutually_exclusive_group.add_argument(
        '-5',
        '--md5-motifs',
        dest='motif_md5_to_motif_ids_filename',
        action='store',
        type=str,
        required=False,
        help="File with list of motif MD5 names to motif IDs."
    )

    parser.add_argument(
        '-r',
        '--regions',
        dest='region_ids_filename',
        action='store',
        type=str,
        required=True,
        help='File with list of region IDs.'
    )
    parser.add_argument(
        '-s',
        '--species',
        dest='species',
        action='store',
        type=str,
        nargs='+',
        required=True,
        help='List of species (assemblies).'
    )
    parser.add_argument(
        '-i',
        '--input',
        dest='cbust_output_format4_filename_template',
        action='store',
        type=str,
        required=True,
        help='Cluster-Buster (output format "-f 4") output files template: "@motif@" is replaced by the list of motifs provided by --motifs or by the motif MD5 name provided by --md5-motifs and "@species@" is replaced by the species provided with --species: "/some/path/@motif@.@species@.tsv"'
    )
    parser.add_argument(
        '-o',
        '--output',
        dest='combined_crm_scores_filename_template',
        action='store',
        type=str,
        required=True,
        help='Combined CRM scores output filename template: "@motif@" is replaced by the list of motifs provided by --motifs or by the motif IDs provided by --md5-motifs: "/some/path/@motif@.crm_scores.tsv"'
    )
    parser.add_argument(
        '-j',
        '--jobs',
        dest='nbr_processes',
        action='store',
        type=int,
        required=False,
        default=1,
        help='Specify the number of processes to use. Each job process will process files for one motif at the time. Default=1 Recommended=10'
    )

    args = parser.parse_args()

    # Get all region IDs.
    region_ids = get_region_or_motif_ids(
        region_or_motif_ids_filename=args.region_ids_filename
    )

    if args.motif_ids_filename:
        # Get all motif IDs.
        motif_ids = get_region_or_motif_ids(
            region_or_motif_ids_filename=args.motif_ids_filename
        )

        # Construct a fake motif MD5 to motif ID dataframe, where
        # the motif MD5 names are the same than the motif IDs.
        motif_md5_to_motif_ids_df = pd.DataFrame(
            np.array(motif_ids),
            index=pd.Index(motif_ids, name='motif_md5'),
            columns=['motif_id'],
        )
    else:
        # Get motif MD5 to motif IDs dataframe.
        motif_md5_to_motif_ids_df = get_motif_md5_to_motif_ids(
            motif_md5_to_motif_ids_filename=args.motif_md5_to_motif_ids_filename
        )

    # Create a multiprocess pool.
    pool = mp.Pool(args.nbr_processes)

    # Process files for each motif (motif MD5 or motif ID) in parallel and
    # keep the result handles, so failures can be reported afterwards.
    async_results = [
        pool.apply_async(
            func=combine_crm_scores_per_region_over_multiple_species,
            args=(
                region_ids,
                args.species,
                args.cbust_output_format4_filename_template,
                args.combined_crm_scores_filename_template,
                motif_md5_or_motif_id,
                motif_md5_to_motif_ids_df.loc[motif_md5_or_motif_id, 'motif_id']
            )
        )
        for motif_md5_or_motif_id in motif_md5_to_motif_ids_df.index
    ]

    pool.close()
    pool.join()

    # Exceptions raised in a worker process are only stored in the result
    # handle: without calling get() they are silently dropped and the script
    # would exit with exit code 0 although output files are missing.
    for async_result in async_results:
        async_result.get()

    sys.exit(0)


if __name__ == "__main__":
    main()

calculate_branch_length_score_per_motif.py

In [ ]:
#!/usr/bin/env python3

import argparse
import functools
import multiprocessing as mp
import pandas as pd
import sys

import Bio.Phylo


def get_motif_ids(motif_ids_filename):
    """
    Read file with all motif IDs in a list.

      - motif_ids_filename:
          TAB-separated file without header line. Only the first column
          (the motif ID) is read. Everything from a "#" character till
          the end of the line is treated as a comment.

    Returns a list with all motif IDs in the same order as in the file.
    """

    with open(motif_ids_filename, 'r') as motif_ids_fh:
        motif_ids_df = pd.read_csv(
            filepath_or_buffer=motif_ids_fh,
            sep='\t',
            header=None,
            names=['motif_id'],
            index_col=None,
            usecols=[0],
            comment='#',
            engine='c',
        )

    # The previous trailing "return None" could never be reached and was removed.
    return motif_ids_df['motif_id'].tolist()


@functools.lru_cache(maxsize=2048)
def get_total_branch_length_score_for_species(
        phylogenetic_tree_filename,
        for_species,
        species_for_tbls):
    """
    Get total branch length score for species specified in for_species:

      - phylogenetic_tree_filename:
          Phylogenetic tree filename in Newick format.
      - for_species:
          Species (assembly) of interest.
      - species_for_tbls:
          Species to use to calculate the total branch length score.
          If it does not include species specified in "for_species" the
          total branch length score will be set to 0.0.
          As the results of this function are cached with lru_cache, it
          has to be passed as a hashable value (e.g. a tuple).

    Returns the total branch length of the phylogenetic tree after removing
    all species which are not in species_for_tbls.

    Raises an AssertionError if not all species in species_for_tbls were
    found in the phylogenetic tree.
    """

    # Set of species names which will be used to calculate total branch length score.
    species_for_tbls = set(species_for_tbls)

    if for_species not in species_for_tbls:
        # If the species of interest is not in the set of species used to
        # calculate total branch length score, return 0.0 as it means the
        # motif is not available in our species of interest.
        #
        # This is checked before parsing the phylogenetic tree, as the tree
        # is not needed for this answer.
        return 0.0

    # Parse phylogenetic tree.
    phylogenetic_tree = Bio.Phylo.read(phylogenetic_tree_filename, 'newick')

    # Create phylogenetic tree with only the species defined in species_for_tbls.
    for species_to_prune in phylogenetic_tree.get_terminals():
        if species_to_prune.name not in species_for_tbls:
            phylogenetic_tree.prune(species_to_prune)

    # Check if all species in species_for_tbls were found.
    assert phylogenetic_tree.count_terminals() == len(species_for_tbls), \
        'Not all species of interest were found in the phylogenetic tree.'

    # Calculate the total branch length after only keeping those species of interest.
    return phylogenetic_tree.total_branch_length()


def get_total_branch_length_score_for_species_true_false_list(
        species_true_false_list,
        phylogenetic_tree_filename,
        for_species,
        all_species):
    """
    Get total branch length score for the species specified in for_species,
    using only the species flagged as True:

      - species_true_false_list:
          List of True and False values, where each position in the list
          corresponds with the species (assembly) at the same position in
          all_species (True: use species, False: do not use species).
      - phylogenetic_tree_filename:
          Phylogenetic tree filename in Newick format.
      - for_species:
          Species (assembly) of interest.
      - all_species:
          List of all species (same length as species_true_false_list)

    This helper translates one row of True/False values to a tuple of
    species names and passes it to "get_total_branch_length_score_for_species",
    so it can be used with apply on each row of a dataframe.
    """

    # Keep the names of the species which are flagged as True. A tuple is
    # used as the result is passed on as one positional argument.
    selected_species = tuple(
        species_name
        for is_selected, species_name in zip(species_true_false_list, all_species)
        if is_selected
    )

    return get_total_branch_length_score_for_species(
        phylogenetic_tree_filename,
        for_species,
        selected_species
    )


def read_crm_scores_per_region_over_multiple_species(
        crm_scores_per_region_over_multiple_species_filename):
    """
    Read CRM scores per region for multiple species in a dataframe:
      - with region IDs ('region_id') as row names (column 1 of input file)
      - with CRM scores for each species as column names (column 2 till n of input file)
    and return it.

      - crm_scores_per_region_over_multiple_species_filename:
          TAB-separated file of which the first line is the header line.
    """

    with open(crm_scores_per_region_over_multiple_species_filename, 'r') as crm_scores_per_region_over_multiple_species_fh:
        region_ids_to_crm_scores_per_species_df = pd.read_csv(
            filepath_or_buffer=crm_scores_per_region_over_multiple_species_fh,
            sep='\t',
            header=0,
            index_col=[0],
            comment='#',
            engine='c',
        )

    # The previous trailing "return None" could never be reached and was removed.
    return region_ids_to_crm_scores_per_species_df


def calculate_branch_length_score_for_motif(
        motif_id,
        crm_scores_per_region_over_multiple_species_filename,
        phylogenetic_tree_filename,
        for_species,
        crm_score_threshold):
    """
    Calculate the summed branch length score of a motif for the species
    specified in for_species:

      - motif_id:
          Motif ID (only passed through to the return value).
      - crm_scores_per_region_over_multiple_species_filename:
          Filename with CRM scores:
            - with region IDs ('region_id') as row names (column 1 of input file)
            - with CRM scores for each species as column names (column 2 till n of input file)
      - phylogenetic_tree_filename:
          Phylogenetic tree filename in Newick format.
      - for_species:
          Species (assembly) of interest.
      - crm_score_threshold:
          CRM score threshold: a motif counts as present in a region for
          a species if its CRM score is higher than this threshold.

    Returns a tuple (motif_id, branch length score). The score is 0.0 if no
    region has a CRM score above the threshold in the species of interest.
    """

    crm_scores_df = read_crm_scores_per_region_over_multiple_species(
        crm_scores_per_region_over_multiple_species_filename=crm_scores_per_region_over_multiple_species_filename
    )

    # All species for which the file has a CRM score column.
    all_species = tuple(crm_scores_df.columns)

    # Only keep the regions in which the motif is present (CRM score above
    # the threshold) in the species of interest.
    present_in_species_of_interest = crm_scores_df[for_species] > crm_score_threshold
    kept_regions_df = crm_scores_df[present_in_species_of_interest]

    # For the kept regions: True if CRM score > threshold in that species, else False.
    motif_present_per_species_df = kept_regions_df > crm_score_threshold

    if len(motif_present_per_species_df) == 0:
        # No region left for the species of interest.
        return motif_id, 0.0

    # Total branch length score per region (row), based on the species in
    # which the motif is present in that region.
    branch_length_score_per_region = motif_present_per_species_df.apply(
        func=get_total_branch_length_score_for_species_true_false_list,
        axis=1,
        args=(
            phylogenetic_tree_filename,
            for_species,
            all_species
        )
    )

    # The score of the motif is the sum over all regions.
    return motif_id, branch_length_score_per_region.sum()


def main():
    """
    Command line entry point: calculate the branch length score for each
    motif in parallel and write one "motif ID <TAB> score" line per motif
    to the output file, in the order of the motif IDs file.
    """
    parser = argparse.ArgumentParser(
        description='Calculate branch length score for each motif.'
    )
    parser.add_argument(
        '-m', '--motifs',
        dest='motif_ids_filename',
        type=str,
        required=True,
        help="File with list of motif IDs."
    )
    parser.add_argument(
        '-p', '--phylo',
        dest='phylogenetic_tree_filename',
        type=str,
        required=True,
        help='Phylogenetic tree file in Newick format.'
    )
    parser.add_argument(
        '-s', '--species',
        dest='for_species',
        type=str,
        required=True,
        help='Species (assembly) of interest for which to calculate the branch length score for each motif.'
    )
    parser.add_argument(
        '-c', '--crm',
        dest='crm_score_threshold',
        type=float,
        default=3.0,
        help='CRM score threshold to use to decide if a CRM is found in a region or not. Default: 3.0'
    )
    parser.add_argument(
        '-i', '--input',
        dest='crm_scores_per_region_over_multiple_species_filename_template',
        type=str,
        required=True,
        help='CRM scores per region over multiple species filename template: "@motif@" is replaced by the list of motifs provided by --motifs: "/some/path/@motif@.crm_scores.tsv"'
    )
    parser.add_argument(
        '-o', '--output',
        dest='bls_filename',
        type=str,
        required=True,
        help='Output filename with a branch length score for each motif.'
    )
    parser.add_argument(
        '-j', '--jobs',
        dest='nbr_processes',
        type=int,
        default=1,
        help='Specify the number of processes to use. Each job process will process files for one motif at the time. Default=1 Recommended=10'
    )

    args = parser.parse_args()

    filename_template = args.crm_scores_per_region_over_multiple_species_filename_template
    placeholder = '@motif@'

    motif_ids = get_motif_ids(motif_ids_filename=args.motif_ids_filename)

    worker_pool = mp.Pool(args.nbr_processes)

    pending_results = []

    # Schedule the calculation for each motif.
    for motif_id in motif_ids:
        # Only the last "@motif@" in the template is replaced by the motif ID.
        placeholder_offset = filename_template.rfind(placeholder)

        if placeholder_offset < 0:
            print(
                'Error: CRM scores per region over multiple species filename template "{0:s}" does not contain "@motif@".'.format(
                    filename_template
                ),
                file=sys.stderr,
            )
            sys.exit(1)

        crm_scores_filename = (
            filename_template[:placeholder_offset]
            + motif_id
            + filename_template[placeholder_offset + len(placeholder):]
        )

        pending_result = worker_pool.apply_async(
            func=calculate_branch_length_score_for_motif,
            args=(
                motif_id,
                crm_scores_filename,
                args.phylogenetic_tree_filename,
                args.for_species,
                args.crm_score_threshold
            )
        )
        pending_results.append(pending_result)

    # Wait till all motifs are processed.
    worker_pool.close()
    worker_pool.join()

    with open(args.bls_filename, 'w') as bls_fh:
        for pending_result in pending_results:
            # Each result is a (motif ID, branch length score) tuple.
            scored_motif_id, branch_length_score = pending_result.get()
            print(scored_motif_id, branch_length_score, sep='\t', file=bls_fh)

    sys.exit(0)


if __name__ == "__main__":
    main()

motif_ids_tsv_to_html.py

In [ ]:
#!/usr/bin/env python3

import argparse
import html
import os
import sys


# Pick the first data directory which exists; if none of the known locations
# exists, fall back to a "data" directory next to this script. The fallback
# is only evaluated when it is needed.
data_dir = (
    '/home/icistarget/data'
    if os.path.exists('/home/icistarget/data/')
    else (
        '/staging/leuven/stg_00002/lcb/icistarget/data'
        if os.path.exists('/staging/leuven/stg_00002/lcb/icistarget/data/')
        else os.path.join(os.path.dirname(__file__), 'data')
    )
)

# Subdirectories of the data directory used by the classes below.
motif_to_tf_dir = os.path.join(data_dir, 'motif2tf/snapshots')
motif_collections_dir = os.path.join(data_dir, 'motifCollection')


class MotifCollections:
    """
    Locations of the files of each motif collection version and a reader
    for the motif MD5 name to motif names mapping file.
    """

    # Motif collection version => zip file (below the motif collections directory).
    zips = {
        3: os.path.join(motif_collections_dir, 'zips/motifs-v3.zip'),
        6: os.path.join(motif_collections_dir, 'zips/motifs-v6-nr.zip'),
        7: os.path.join(motif_collections_dir, 'zips/motif_collection_v7.zip'),
        8: os.path.join(motif_collections_dir, 'zips/motif_collection_v8.zip'),
        9: os.path.join(motif_collections_dir, 'zips/motif_collection_v9.zip'),
    }

    # Motif collection version => per version directory
    # (presumably the extracted content of the zip file above -- TODO confirm).
    extracted = {
        3: os.path.join(motif_collections_dir, 'v3'),
        6: os.path.join(motif_collections_dir, 'v6'),
        7: os.path.join(motif_collections_dir, 'v7'),
        8: os.path.join(motif_collections_dir, 'v8'),
        9: os.path.join(motif_collections_dir, 'v9'),
    }

    # Motif collection version => motif MD5 name to motif names mapping file
    # (only listed for version 7, 8 and 9).
    motifs_count_md5_to_motif_names_filename = {
        7: os.path.join(motif_collections_dir, 'v7/motifs_count_md5_to_motif_names.tsv'),
        8: os.path.join(motif_collections_dir, 'v8/motifs_count_md5_to_motif_names.tsv'),
        9: os.path.join(motif_collections_dir, 'v9/motifs_count_md5_to_motif_names.tsv'),
    }

    @staticmethod
    def get_motifs_count_md5_to_motif_names_filename(motif_collection_version):
        """
        Return the motif MD5 name to motif names mapping filename for the
        motif collection version, or None if there is none for that version.
        """
        return MotifCollections.motifs_count_md5_to_motif_names_filename.get(motif_collection_version)

    @staticmethod
    def get_motifs_count_md5_to_motif_names(motif_collection_version):
        """
        Read the motif MD5 name to motif names mapping file of the motif
        collection version.

        Returns a tuple of 2 dictionaries, both with the motif MD5 name as key:
          - motif MD5 name => motif ID (column 2 of the file).
          - motif MD5 name => list of alternative motif IDs (column 3 till n).
        Both dictionaries are empty if there is no mapping file for the version.
        """
        md5_to_motif_id = {}
        md5_to_alternative_motif_ids = {}

        mapping_filename = MotifCollections.get_motifs_count_md5_to_motif_names_filename(
            motif_collection_version=motif_collection_version,
        )

        if not mapping_filename:
            return md5_to_motif_id, md5_to_alternative_motif_ids

        with open(mapping_filename, 'r') as mapping_fh:
            for raw_line in mapping_fh:
                # Skip comment lines.
                if raw_line.startswith('#'):
                    continue

                fields = raw_line.rstrip('\n').split('\t')

                # Skip lines without at least a motif MD5 name and a motif ID.
                if len(fields) < 2:
                    continue

                # motif_md5  motif_id  alternative_motif_ids
                motif_md5, motif_id, *alternative_motif_ids = fields

                md5_to_motif_id[motif_md5] = motif_id
                md5_to_alternative_motif_ids[motif_md5] = alternative_motif_ids

        return md5_to_motif_id, md5_to_alternative_motif_ids


class MotifToTF:
    """
    Registry and loader for motif to TF annotation tables.

    The tables are looked up by the key (motif to TF version, nomenclature, 'direct' or 'indirect').
    """

    # (motif to TF version, nomenclature, 'direct' | 'indirect') -> motif to TF table filename.
    filenames = {
        (3, 'flybase', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v3-flybase-m0.0-o0.0.tbl'),
        (6, 'flybase', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v6-flybase-m0.000-o0.0.tbl'),
        # NOTE(review): this key says version 3 but points to the v6 indirect table, which looks
        # like a typo for version 6. It is kept so existing version 3 lookups behave as before;
        # confirm whether it should be removed.
        (3, 'flybase', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v6-flybase-m0.001-o0.0.tbl'),
        # The v6 flybase indirect table was previously only reachable via the version 3 key above.
        (6, 'flybase', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v6-flybase-m0.001-o0.0.tbl'),
        (6, 'hgnc', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v6-nr-hgnc-m0.0-o0.0.tbl'),
        (6, 'hgnc', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v6-nr-hgnc-m0.001-o0.0.tbl'),
        (6, 'mgi', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v6-nr.mgi-m0.001-o0.0.tbl'),
        (7, 'fbgn', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.fbgn-m0.000-o0.0.tbl'),
        (7, 'fbgn', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.fbgn-m0.001-o0.0.tbl'),
        (7, 'flybase', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.flybase-m0.000-o0.0.tbl'),
        (7, 'flybase', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.flybase-m0.001-o0.0.tbl'),
        (7, 'hgnc', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.hgnc-m0.000-o0.0.tbl'),
        (7, 'hgnc', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.hgnc-m0.001-o0.0.tbl'),
        (7, 'mgi', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.mgi-m0.000-o0.0.tbl'),
        (7, 'mgi', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v7-nr.mgi-m0.001-o0.0.tbl'),
        (8, 'fbgn', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.fbgn-m0.000-o0.0.tbl'),
        (8, 'fbgn', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.fbgn-m0.001-o0.0.tbl'),
        (8, 'flybase', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.flybase-m0.000-o0.0.tbl'),
        (8, 'flybase', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.flybase-m0.001-o0.0.tbl'),
        (8, 'hgnc', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.hgnc-m0.000-o0.0.tbl'),
        (8, 'hgnc', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.hgnc-m0.001-o0.0.tbl'),
        (8, 'mgi', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.mgi-m0.000-o0.0.tbl'),
        (8, 'mgi', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v8-nr.mgi-m0.001-o0.0.tbl'),
        (9, 'fbgn', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.fbgn-m0.000-o0.0.tbl'),
        (9, 'fbgn', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.fbgn-m0.001-o0.0.tbl'),
        (9, 'flybase', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.flybase-m0.000-o0.0.tbl'),
        (9, 'flybase', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.flybase-m0.001-o0.0.tbl'),
        (9, 'hgnc', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.hgnc-m0.000-o0.0.tbl'),
        (9, 'hgnc', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.hgnc-m0.001-o0.0.tbl'),
        (9, 'mgi', 'direct'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.mgi-m0.000-o0.0.tbl'),
        (9, 'mgi', 'indirect'): os.path.join(motif_to_tf_dir, 'motifs-v9-nr.mgi-m0.001-o0.0.tbl'),
    }

    @staticmethod
    def get_motif_to_tf_filename(motif_to_tf_version, nomenclature, direct_or_indirect):
        """
        Look up the motif to TF table filename.

        :param motif_to_tf_version: Motif to TF version (first element of the lookup key).
        :param nomenclature: Gene nomenclature (second element of the lookup key).
        :param direct_or_indirect: 'direct' or 'indirect' (third element of the lookup key).
        :return: Filename registered for that combination, or None if there is none.
        """

        return MotifToTF.filenames.get((motif_to_tf_version, nomenclature, direct_or_indirect))

    @staticmethod
    def get_motif_to_tf(motif_to_tf_version, nomenclature, direct_or_indirect):
        """
        Load a motif to TF table as a dictionary.

        Lines starting with '#' are skipped. Only lines with exactly 4 or exactly 13
        tab-separated columns are used; all other lines are ignored.

        :param motif_to_tf_version: Motif to TF version (first element of the lookup key).
        :param nomenclature: Gene nomenclature (second element of the lookup key).
        :param direct_or_indirect: 'direct' or 'indirect' (third element of the lookup key).
        :return: Dictionary with motif ID as key and the list of gene names (in file order)
                 as value. Empty when no table is registered for the requested combination.
        """

        motif_to_tf = dict()

        motif_to_tf_filename = MotifToTF.get_motif_to_tf_filename(
            motif_to_tf_version=motif_to_tf_version,
            nomenclature=nomenclature,
            direct_or_indirect=direct_or_indirect,
        )

        if not motif_to_tf_filename:
            return motif_to_tf

        with open(motif_to_tf_filename, 'r') as fh:
            for line in fh:
                line = line.rstrip('\n')

                if line.startswith('#'):
                    continue

                columns = line.split('\t')

                if len(columns) == 4:
                    # motif_id  motif_description  gene_name  description
                    tf = columns[2]
                elif len(columns) == 13:
                    # motif_id  motif_name  motif_description  source_name  source_version  gene_name
                    # motif_similarity_qvalue  similar_motif_id  similar_motif_description
                    # orthologous_identity  orthologous_gene_name  orthologous_species  description
                    tf = columns[5]
                else:
                    # Unknown table layout for this line.
                    continue

                motif_to_tf.setdefault(columns[0], []).append(tf)

        return motif_to_tf


def _build_table_header_and_footer(column_labels, display_rank, show_alternative_motif_ids, rank_label):
    """
    Build the <thead> (followed by the opening <tbody>) and the <tfoot> of the motif table.

    :param column_labels: HTML-safe labels for the TSV columns shown after the motif info columns.
    :param display_rank: Add a rank column as first column (True/False).
    :param show_alternative_motif_ids: Add the alternative motif IDs column (True/False).
    :param rank_label: Label used for the rank column in the table header.
    :return: Tuple of (header HTML, footer HTML).
    """

    motif_info_labels = ['motif_id']

    if show_alternative_motif_ids:
        motif_info_labels.append('alternative_motif_ids')

    motif_info_labels.extend(['motif logo', 'direct TFs', 'indirect TFs'])

    header_labels = ([rank_label] if display_rank else []) + motif_info_labels + list(column_labels)
    # The search box of the rank column is always labeled "rank", whatever the header label is.
    footer_labels = (['rank'] if display_rank else []) + motif_info_labels + list(column_labels)

    header_html = '\n'.join([
        '<thead>',
        '<tr>' + ''.join('<th>' + label + '</th>' for label in header_labels) + '</tr>',
        '</thead>',
        '<tbody>',
    ])

    footer_html = (
        '<tfoot>\n'
        '<tr>' +
        ''.join('<th><input type="text" placeholder="Search ' + label + '" /></th>'
                for label in footer_labels) +
        '</tr>'
        '</tfoot>'
    )

    return header_html, footer_html


def convert_tsv_to_html(motifs_tsv_filename, motifs_html_filename, motif_collection_version,
                        motif_to_tf_direct, motif_to_tf_indirect,
                        motif_id_column, motif_md5_column,
                        motifs_count_md5_to_motif_id, motifs_count_md5_to_alternative_motif_ids,
                        column_indices_output, display_rank, title):
    """
    Convert a TSV file with a motif ID column to an HTML report with:
      - Rank (if display_rank is True)
      - motif ID
      - alternative motif IDs (if motif_md5_column was chosen)
      - motif logo
      - direct TFs
      - indirect TFs
    columns added.

    Lines of the TSV file starting with '#' are treated as header lines: their columns are used
    as column names. If the first data line is reached before any header line, the columns are
    named "column N" instead. Empty lines are skipped. All values taken from the TSV file are
    HTML-escaped before they are written to the report.

    :param motifs_tsv_filename: TSV file with motif ID column.
    :param motifs_html_filename: HTML file to which the report is written.
    :param motif_collection_version: Motif collection version of the used motif IDs (integer).
    :param motif_to_tf_direct: motif to TF dictionary with all directly annotated TFs.
    :param motif_to_tf_indirect: motif to TF dictionary with all directly and indirectly annotated TFs.
    :param motif_id_column: zero-based column index that contains the motif IDs in the TSV file
                            (None if motif_md5_column is used).
    :param motif_md5_column: zero-based column index that contains the motif md5 name in the TSV file
                             (None if motif_id_column is used).
    :param motifs_count_md5_to_motif_id: motif md5 name to motif ID dictionary.
    :param motifs_count_md5_to_alternative_motif_ids: motif md5 name to alternative motif ID dictionary.
    :param column_indices_output: Only output those zero-based column indices of the TSV file to the
                                  report (None: output all columns, except the motif ID column).
    :param display_rank: Display the rank for each row (True/False).
    :param title: Title used in the HTML report.
    :return: None
    :raises KeyError: if a motif md5 name of the TSV file is not in the md5 dictionaries.
    :raises IndexError: if a requested column index does not exist in a line of the TSV file.
    """

    # Specify jQuery version.
    jquery_js_url = 'https://cdnjs.cloudflare.com/ajax/libs/jquery/3.2.1/jquery.min.js'

    # Compose DataTables version on https://datatables.net/download/
    #   - jQuery:
    #       - No jQuery
    #   - Styling:
    #       - DataTables
    #   - DataTables:
    #       - DataTables
    #   - Buttons:
    #       - Buttons
    #       - Column visibility
    #       - HTML5 export
    #       - JSZip
    #       - Print view
    #   - ColReorder:
    #       - ColReorder
    #   - FixedHeader:
    #       - FixedHeader
    #   - RowReorder:
    #       - RowReorder
    #   - Select:
    #       - Select
    #   - Packaging options:
    #       - Minify
    #       - Single file
    #       - CDN
    data_tables_js_url = 'https://cdn.datatables.net/v/dt/jszip-3.1.3/dt-1.10.15/b-1.3.1/b-colvis-1.3.1/b-html5-1.3.1/b-print-1.3.1/cr-1.3.3/fh-3.1.2/rr-1.2.0/se-1.2.2/datatables.min.js'
    data_tables_css_url = 'https://cdn.datatables.net/v/dt/jszip-3.1.3/dt-1.10.15/b-1.3.1/b-colvis-1.3.1/b-html5-1.3.1/b-print-1.3.1/cr-1.3.3/fh-3.1.2/rr-1.2.0/se-1.2.2/datatables.min.css'

    show_alternative_motif_ids = motif_md5_column is not None

    with open(motifs_html_filename, 'w') as fh_html:
        # Write everything up to and including the opening <table> tag.
        print(
            '''<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">''',
            '''  <title>{0:s}</title>'''.format(html.escape(title)),
            '''  <script type="text/javascript" charset="utf-8" src="{0:s}"></script>'''.format(jquery_js_url),
            '''  <script type="text/javascript" charset="utf-8" src="{0:s}"></script>'''.format(data_tables_js_url),
'''  <script>
    $(document).ready( function () {
        // DataTable
        var table = $('#motif_table_id').DataTable( {
            //dom: '<B><f><lp>rtip',
            dom: '<"clearfix"B><"clearfix"f><"clearfix"lp><"clearfix"i>rtip',
            ordering: true,
            searching: true,
            paging: true,
            lengthMenu: [
                [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, -1],
                [50, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, "All"],
            ],
            pageLength: 500,
            buttons: [
                {
                    extend: 'colvis',
                    text: 'Show/hide columns',
                },
                {
                    extend: 'collection',
                    text: 'Copy',
                    buttons: [
                        {
                            extend: 'copyHtml5',
                            text: 'Selected',
                            exportOptions: {
                                 columns: ':visible',
                                 modifier: {
                                     selected: true,
                                 }
                            }
                        },
                        {
                            extend: 'copyHtml5',
                            text: 'All',
                            exportOptions: {
                                 columns: ':visible',
                            }
                        },
                    ],
                },
                {
                    extend: 'collection',
                    text: 'Export',
                    buttons: [
                        {
                            extend: 'collection',
                            text: 'CSV',
                            buttons: [
                                {
                                    extend: 'csvHtml5',
                                    text: 'Selected',
                                    exportOptions: {
                                         columns: ':visible',
                                         modifier: {
                                             selected: true,
                                         }
                                    }
                                },
                                {
                                    extend: 'csvHtml5',
                                    text: 'All',
                                    exportOptions: {
                                         columns: ':visible',
                                    }
                                },
                            ],
                        },
                        {
                            extend: 'collection',
                            text: 'Excel',
                            buttons: [
                                {
                                    extend: 'excelHtml5',
                                    text: 'Selected',
                                    exportOptions: {
                                         columns: ':visible',
                                         modifier: {
                                             selected: true,
                                         }
                                    }
                                },
                                {
                                    extend: 'excelHtml5',
                                    text: 'All',
                                    exportOptions: {
                                         columns: ':visible',
                                    }
                                },
                            ],
                        },
                    ],
                },
                {
                    extend: 'collection',
                    text: 'Print',
                    buttons: [
                        {
                            extend: 'print',
                            text: 'Selected',
                            exportOptions: {
                                 stripHtml: false,
                                 columns: ':visible',
                                 modifier: {
                                     selected: true,
                                 }
                            }
                        },
                        {
                            extend: 'print',
                            text: 'All',
                            exportOptions: {
                                 stripHtml: false,
                                 columns: ':visible',
                            }
                        },
                    ],
                },
            ],
            colReorder: true,
            fixedHeader: {
                footer: true,
            },
            select: true,
            rowReorder: true,
        } );

        // Add search filters to each column of the table footer.
        table.columns().every( function () {
            var column = this;

            $('input', this.footer() ).on('keyup change', function () {
                column
                    .search(this.value)
                    .draw();
            } );
        } );

    } );
  </script>''',
            '  <link rel="stylesheet" type="text/css" href="{0:s}">'.format(data_tables_css_url),
            '''  <style>
      body {
          margin: 10px 30px;
      }
      .clearfix:after {
          content: "";
          clear: both;
          display: table;
          margin: 10px 0;
      }
      a.dt-button {
          margin: 10px 15px 10px 0;
      }
      a.buttons-colvis {
          margin: 10px 50px 10px 0;
      }
      #motif_table_id_info {
          padding-left: 20px;
          color: blue;
      }
      img.logo {
          width: 200px;
      }
  </style>
  </head>
  <body>
    <table id="motif_table_id" class="display compact">''',
            sep='\n',
            file=fh_html,
        )

        header_printed = False
        footer = ''
        current_ranking = 0

        with open(motifs_tsv_filename, 'r') as fh_tsv:
            for line in fh_tsv:
                line = line.rstrip('\r\n')

                if not line:
                    continue

                header_line = line.startswith('#')

                if header_line:
                    # Get header and remove leading comment and whitespace.
                    line = line.lstrip('# ')
                else:
                    # Only data lines count for the ranking.
                    current_ranking += 1

                columns = line.split('\t')

                if column_indices_output:
                    # Take only specified columns for displaying in the output after the motif info.
                    output_columns = [columns[column_idx] for column_idx in column_indices_output]
                else:
                    # Make a copy of all columns.
                    output_columns = columns.copy()

                    # Make the corresponding column indices output list,
                    # which will be used for naming the columns if no header was found
                    # and for selecting the columns of all following lines.
                    column_indices_output = list(range(len(columns)))

                    if motif_id_column is not None:
                        # Remove the motif ID column from the output list as
                        # it will be displayed in the motif info part already.
                        output_columns.pop(motif_id_column)
                        column_indices_output.remove(motif_id_column)

                if header_line:
                    # Print table header with header names used in the TSV file and
                    # make table footer with search boxes for each column.
                    header_html, footer = _build_table_header_and_footer(
                        column_labels=[html.escape(output_column) for output_column in output_columns],
                        display_rank=display_rank,
                        show_alternative_motif_ids=show_alternative_motif_ids,
                        rank_label='rank',
                    )
                    print(header_html, file=fh_html)
                    header_printed = True

                    continue

                if current_ranking == 1 and not header_printed:
                    # First data line without a preceding header line:
                    # print table header with header column numbered names.
                    header_html, footer = _build_table_header_and_footer(
                        column_labels=['column {0:d}'.format(column_idx + 1)
                                       for column_idx in column_indices_output],
                        display_rank=display_rank,
                        show_alternative_motif_ids=show_alternative_motif_ids,
                        rank_label='Rank',
                    )
                    print(header_html, file=fh_html)
                    header_printed = True

                alternative_motif_ids = []

                if motif_md5_column is not None:
                    # Get motif md5 name.
                    motif_md5 = columns[motif_md5_column]
                    # Get motif ID corresponding to motif md5 name.
                    motif_id = motifs_count_md5_to_motif_id[motif_md5]
                    # Get alternative motif IDs corresponding to motif md5 name.
                    alternative_motif_ids = motifs_count_md5_to_alternative_motif_ids[motif_md5]
                else:
                    # Get motif ID.
                    motif_id = columns[motif_id_column]

                # Make URL for motif logo.
                motif_id_logo_url = 'http://motifcollections.aertslab.org/v{0:d}/logos/{1:s}.png'.format(
                    motif_collection_version,
                    motif_id,
                )

                # Collect the HTML-safe content of each cell of the row.
                row_cells = []

                if display_rank:
                    row_cells.append(str(current_ranking))

                row_cells.append(html.escape(motif_id))

                if show_alternative_motif_ids:
                    row_cells.append(html.escape(' '.join(alternative_motif_ids)))

                # The URL contains the motif ID, so it is escaped before it is put in the attribute.
                row_cells.append(
                    '<img src="' + html.escape(motif_id_logo_url) +
                    '" alt="' + html.escape(motif_id) + '" class="logo" />'
                )
                # Motifs without annotation get a cell with a single space.
                row_cells.append(
                    ' '.join(html.escape(tf_direct)
                             for tf_direct in motif_to_tf_direct.get(motif_id, ' '))
                )
                row_cells.append(
                    ' '.join(html.escape(tf_indirect)
                             for tf_indirect in motif_to_tf_indirect.get(motif_id, ' '))
                )
                # Values from the TSV file are escaped so they can not break or inject HTML.
                row_cells.extend(html.escape(output_column) for output_column in output_columns)

                # Print each row of the table.
                print(
                    '<tr>' + ''.join('<td>' + row_cell + '</td>' for row_cell in row_cells) + '</tr>',
                    file=fh_html,
                )

        print(
            '</tbody>',
            footer,
            '</table>',
            '</body>',
            '</html>',
            sep='\n',
            file=fh_html,
        )


def main():
    """
    Command-line entry point: parse the arguments, load the motif to TF annotation
    (and the motif md5 name mapping if a motif md5 column was given) and write the HTML report.

    Column numbers given on the command line are one-based and are converted to
    zero-based indices before they are passed to convert_tsv_to_html().
    Exits with status 0 on success.
    """

    parser = argparse.ArgumentParser(
        description='Convert TSV with motif IDs to a HTML file with logos and TF annotation.'
    )
    parser.add_argument(
        '-i',
        '--tsv',
        dest='tsv_filename',
        action='store',
        type=str,
        required=True,
        help='TSV input file with motifs'
    )
    parser.add_argument(
        '-o',
        '--html',
        dest='html_filename',
        action='store',
        type=str,
        required=True,
        help='HTML output file'
    )
    # Not required: the documented default (9) could never apply while the option was mandatory.
    parser.add_argument(
        '-m',
        '--motif-collection-version',
        dest='motif_collection_version',
        action='store',
        type=int,
        required=False,
        choices=(3, 6, 7, 8, 9),
        default=9,
        help='Motif collection version: 3, 6, 7, 8, 9 (default: 9)'
    )
    parser.add_argument(
        '-n',
        '--nomenclature',
        dest='nomenclature',
        action='store',
        type=str,
        required=True,
        choices=('fbgn', 'flybase', 'hgnc', 'mgi'),
        help='Nomenclature to use for motif to TF: fbgn, flybase, hgnc, mgi'
    )

    # Exactly one of the motif ID column and the motif md5 column has to be given.
    motif_group = parser.add_mutually_exclusive_group(required=True)
    motif_group.add_argument(
        '-d',
        '--motif-id-column',
        dest='motif_id_column',
        action='store',
        type=int,
        required=False,
        help='Specify which column contains the motif ID'
    )
    motif_group.add_argument(
        '-5',
        '--motif-md5-column',
        dest='motif_md5_column',
        action='store',
        type=int,
        required=False,
        help='Specify which column contains the motif md5 name'
    )

    parser.add_argument(
        '-c',
        '--columns',
        dest='one_based_column_indices_output',
        nargs='*',
        action='store',
        type=int,
        required=False,
        help='Specify which columns to use (default: all)'
    )
    parser.add_argument(
        '-r',
        '--rank',
        dest='display_rank',
        action='store_true',
        required=False,
        default=False,
        help='Display rank for each row in the HTML output.'
    )
    parser.add_argument(
        '-t',
        '--title',
        dest='title',
        action='store',
        type=str,
        required=False,
        help='Specify title which will be used in the HTML output.'
    )

    args = parser.parse_args()

    # Column numbers are one-based: 0 or a negative number would silently turn into
    # a negative (count from the end) index after the conversion to zero-based indices.
    one_based_columns = [
        one_based_column
        for one_based_column in (args.motif_id_column, args.motif_md5_column)
        if one_based_column is not None
    ] + list(args.one_based_column_indices_output or [])

    if any(one_based_column < 1 for one_based_column in one_based_columns):
        parser.error('Column numbers are one-based and must be 1 or greater.')

    motif_to_tf_direct = MotifToTF.get_motif_to_tf(
        motif_to_tf_version=args.motif_collection_version,
        nomenclature=args.nomenclature,
        direct_or_indirect='direct',
    )
    motif_to_tf_indirect = MotifToTF.get_motif_to_tf(
        motif_to_tf_version=args.motif_collection_version,
        nomenclature=args.nomenclature,
        direct_or_indirect='indirect',
    )

    if args.motif_md5_column is not None:
        (motifs_count_md5_to_motif_id,
         motifs_count_md5_to_alternative_motif_ids) = MotifCollections.get_motifs_count_md5_to_motif_names(
            motif_collection_version=args.motif_collection_version
        )
    else:
        # The md5 mappings are only needed when the TSV file contains motif md5 names.
        motifs_count_md5_to_motif_id = None
        motifs_count_md5_to_alternative_motif_ids = None

    convert_tsv_to_html(
        motifs_tsv_filename=args.tsv_filename,
        motifs_html_filename=args.html_filename,
        motif_to_tf_direct=motif_to_tf_direct,
        motif_to_tf_indirect=motif_to_tf_indirect,
        motif_collection_version=args.motif_collection_version,
        motif_id_column=(args.motif_id_column - 1 if args.motif_id_column is not None else None),
        motif_md5_column=(args.motif_md5_column - 1 if args.motif_md5_column is not None else None),
        motifs_count_md5_to_motif_id=motifs_count_md5_to_motif_id,
        motifs_count_md5_to_alternative_motif_ids=motifs_count_md5_to_alternative_motif_ids,
        column_indices_output=([one_based_column_idx - 1
                                for one_based_column_idx in args.one_based_column_indices_output]
                               if args.one_based_column_indices_output
                               else None),
        display_rank=args.display_rank,
        # Fall back to the name of the HTML file when no title was given.
        title=(args.title
               if args.title
               else os.path.basename(args.html_filename)),
    )

    sys.exit(0)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()