#!/usr/bin/env python
from collections import defaultdict
import numpy as np
import pandas as pd
from GGR import utils
import argparse

# Command-line interface. RawTextHelpFormatter prints the description below
# with its line breaks exactly as written.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawTextHelpFormatter,
    description="""

take_mean_across_replicates.py

Given a tab-separated dataframe with an index column of, e.g., gene names or peak names and
a header of sample names such as "t2_rep1" or "CTCF.t2_rep1", return a dataframe with the same
index and with one column per time point holding the mean computed across that time point's
replicates. For example, "t2_rep1", "t2_rep2", and "t2_rep3" become "t2".

""")

##################################################
# Both paths are mandatory; argparse exits with a usage error if either is missing.
parser.add_argument("--in_df",
                    help="path to input dataframe ",
                    required=True)

parser.add_argument("--out_df",
                    help="path to output dataframe ",
                    required=True)

##################################################
# Parsed once at module level; the rest of the script reads args.in_df and
# args.out_df.
args = parser.parse_args()

def mean_across_replicates(df):
    """Collapse replicate columns of *df* into one mean column per time point.

    Columns are grouped by the part of their label that precedes the first
    underscore, so "t2_rep1" and "t2_rep2" both belong to "t2" and
    "CTCF.t2_rep1" belongs to "CTCF.t2". A label without an underscore forms
    (or joins) the group named after itself.

    Parameters
    ----------
    df : pandas.DataFrame
        One column per sample; column labels must be strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as *df* and one column per time
        point, in sorted label order, holding the row-wise mean of that time
        point's columns. *df* itself is not modified.
    """
    # map each time point to the samples (columns) that belong to it
    timepoint_to_rep = defaultdict(list)
    for rep in df.columns:
        timepoint_to_rep[rep.split('_')[0]].append(rep)

    # Build the result in a fresh frame rather than adding a column to `df`
    # and dropping the replicates afterwards: when a time point label equals
    # one of its own source columns (e.g. a lone "t5", or "t2" next to
    # "t2_rep1"), the drop would also remove the mean that was just computed.
    means = pd.DataFrame(index=df.index)
    for timepoint in sorted(timepoint_to_rep):
        # take the mean across replicates
        means[timepoint] = df[timepoint_to_rep[timepoint]].mean(axis=1)
    return means


if __name__ == "__main__":
    in_df = pd.read_csv(args.in_df, sep='\t', index_col=0)
    in_df = mean_across_replicates(in_df)

    # sort df and write to file
    in_df = in_df[utils.sort_by_timepoint(in_df.columns)]
    in_df.to_csv(args.out_df, sep='\t')
