from parts2014_gfpvar.tools.common import *
from parts2014_gfpvar.analysis.seq.util import *
from parts2014_gfpvar.analysis.qtl.util import get_all_metadata
from parts2014_gfpvar.analysis.array.util import write_sample_array_data


def dataset_S1():
    """Export per-sample data files and bundle them into the D1 tarball.

    Every entry returned by get_all_metadata("seq") is dispatched to one of
    two writers: entries containing "000" go to write_sample_array_data, all
    others go to write_sample_sequence_data with rec_cutoff=0.9. Entries
    whose first element is "#" are skipped. Afterwards the *.tab files under
    seq/txt_output and arrays/txt_output are archived with tar into
    <DATA_DIR>/paper/dataset_D1.tar.gz.
    """
    # The "seq" metadata also includes the array samples.
    for entry in get_all_metadata("seq"):
        if entry[0] == "#":
            # Comment line, not a sample.
            continue
        if entry.count("000") == 0:
            write_sample_sequence_data(entry, rec_cutoff=0.9)
        else:
            # Array samples have IDs with many 0s.
            write_sample_array_data(entry)

    # The shell expands the *.tab globs; the exit status of tar is not checked.
    tar_command = "tar cvfz %s/paper/dataset_D1.tar.gz %s/seq/txt_output/*.tab %s/arrays/txt_output/*.tab" % ((DATA_DIR,) * 3)
    os.system(tar_command)


def dataset_S2(filename="%s/paper/dataset_D2.tab"%DATA_DIR):
    """Write dataset D2: a column header followed by the mutation table.

    The output consists of one header line and then the unmodified contents
    of <DATA_DIR>/seq/ref/thinned_mutations_withsift.tab.

    filename -- path of the file to create; an existing file is overwritten.

    Raises IOError (OSError on Python 3) if the source table cannot be read
    or the output file cannot be written. The previous shell-based version
    ignored such failures and could leave a header-only file behind.
    """
    import shutil  # local import: only this function needs it

    source = "%s/seq/ref/thinned_mutations_withsift.tab" % DATA_DIR
    header = b"#Chrm\tLoc\tORF\tGene\tType\tMutation\tSIFT_call\tSIFT_score\n"

    # Open the source first so that a missing table fails before the output
    # file is truncated. Binary mode copies the bytes verbatim, like `cat`.
    with open(source, "rb") as ifh:
        with open(filename, "wb") as ofh:
            ofh.write(header)
            shutil.copyfileobj(ifh, ofh)  # append content



def main():
    """Generate both dataset files, D1 first and then D2."""
    for build_dataset in (dataset_S1, dataset_S2):
        build_dataset()


# Build the datasets only when this file is run as a script, not on import.
if __name__ == '__main__':
    main()
