/*
Copyright (c) 2011-2012 Daniel Marbach(1,2)

(1) Massachusetts Institute of Technology, Cambridge MA, USA
(2) Broad Institute of MIT and Harvard
 
We release this software open source under an MIT license (see below). If this
software was useful for your scientific work, please cite our paper available at:
http://compbio.mit.edu/flynet

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
 */
package edu.mit.compbio.flynet.misc;

import edu.mit.compbio.flynet.*;

import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;


/**
 * Parses the loss of function / overexpression datasets.
 * 
 * Each public parse method configures the parser for one hard-coded input file
 * (header size, column layout, perturbed TF), reads the expression values of every
 * target gene, and then writes one output file per threshold listing the genes whose
 * median expression value reaches the threshold (in absolute value) at any time point.
 * Every output line has the form "TF &lt;tab&gt; target".
 */
public class ExpressionDatasetParser {

	/** The dataset file */
	private String filename_ = null;
	/** The TF that was perturbed */
	private String tf_ = null;
	/** Number of header lines */
	private int header_ = -1;
	/** The total number of columns of the file (used to check format) */
	private int numCols_ = -1;
	/** The column where the target genes are listed (1-based) */
	private int targetCol_ = -1;
	/** The columns where the expression data is listed (1-based) */
	private ArrayList<Integer> exprCols_ = null;
	/** 
	 * The expression profiles for each gene: expr_.get(gene).get(c) lists the values found in the
	 * c'th expression column, one entry per line (probe) of the file that mentions the gene 
	 */
	private HashMap<String, ArrayList<ArrayList<Double>>> expr_ = null;
	/** The thresholds used to call differentially expressed genes */
	private final double[] thresholds_ = {0, 0.7, 1, 1.5}; // we add threshold 0 to create a list of all genes
	
	
	// ============================================================================
	// PUBLIC METHODS
	
	/** 
	 * biniou loss of function / overexpression data from the Furlong lab.
	 * Header: 1
	 * Col 1: target gene
	 * Col 4-9: loss of function 
	 * Col 10-12: overexpression
	 * 
	 * Output: Diff expressed genes for loss and gain experiments
	 */
	public void parseBiniou() {
		
		tf_ = "FBgn0045759"; // biniou
		load("resources_private/Furlong/biniou/biniou_expr.txt", 1, 12, 1, 4, 12);

		// Indexes below refer to the parsed expression columns (1 = file column 4)
		int[][] lossCols = {{1}, {2}, {3}, {4}, {5}, {6}};
		writeDifferentiallyExpressedGenes(lossCols, "resources_private/Furlong/biniou/biniou_loss");
		
		int[][] gainCols = {{7}, {8}, {9}};
		writeDifferentiallyExpressedGenes(gainCols, "resources_private/Furlong/biniou/biniou_gain");
	}
	  	

	// ----------------------------------------------------------------------------

	/** 
	 * Twist loss/gain of function data from the Furlong lab.
	 * Note, the same gene appears multiple times (different probes), I take the median
	 * File 1 - cDNA
	 * Header: 2
	 * Col 3: target gene
	 * Col 4-19: expression data (4 time points, 4 replicates each)
	 * 
	 * File 2 - indac
	 * Header: 3
	 * Col 4: target gene
	 * Col 5-20: expression data (4 time points, 4 replicates each, same as cDNA)
	 * 
	 * File 3 - gain
	 * Header: 3
	 * Col 4: target gene
	 * Col 5-16: expression data (3 time points, 4 replicates each)
	 */
	public void parseTwist() {
		
		tf_ = "FBgn0003900"; // twist
		
		// 4 time points with 4 replicates each (shared by both loss of function files)
		int[][] lossCols = {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}, {13,14,15,16}};
		// 3 time points with 4 replicates each
		int[][] gainCols = {{1,2,3,4}, {5,6,7,8}, {9,10,11,12}};
		
		// loss_cDNA
		load("resources_private/Furlong/twist/twist_loss_cDNA.txt", 2, 19, 3, 4, 19);
		writeDifferentiallyExpressedGenes(lossCols, "resources_private/Furlong/twist/twist_loss_cDNA");
		
		// loss_indac
		load("resources_private/Furlong/twist/twist_loss_indac.txt", 3, 20, 4, 5, 20);
		writeDifferentiallyExpressedGenes(lossCols, "resources_private/Furlong/twist/twist_loss_indac");
		
		// gain
		load("resources_private/Furlong/twist/twist_gain.txt", 3, 16, 4, 5, 16);
		writeDifferentiallyExpressedGenes(gainCols, "resources_private/Furlong/twist/twist_gain");
	}
	
	
	// ----------------------------------------------------------------------------

	/** 
	 * mef2 loss of function data from the Furlong lab.
	 * Header: 1
	 * Col 4: target gene
	 * Col 55-66: loss of function (medians over 4 repeats) 
	 * Col 67-78: q-values
	 */
	public void parseMef2() {
		
		tf_ = "FBgn0011656"; // mef2
		load("resources_private/Furlong/mef2/mef2_loss.txt", 1, 78, 4, 55, 66);

		int[][] lossCols = {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}, {11}, {12}};
		writeDifferentiallyExpressedGenes(lossCols, "resources_private/Furlong/mef2/mef2_loss");
	}

	
	// ============================================================================
	// PRIVATE METHODS

	/** 
	 * Configure the parser for the given file layout, discard previously parsed data, and parse the file.
	 * All columns are 1-based; the expression columns are the inclusive range firstExprCol..lastExprCol.
	 * The perturbed TF (tf_) is not touched, the caller sets it.
	 */
	private void load(String filename, int header, int numCols, int targetCol, int firstExprCol, int lastExprCol) {
		
		expr_ = new HashMap<String, ArrayList<ArrayList<Double>>>();
		filename_ = filename;
		
		header_ = header;
		numCols_ = numCols;
		targetCol_ = targetCol;
		
		exprCols_ = new ArrayList<Integer>();
		for (int col=firstExprCol; col<=lastExprCol; col++)
			exprCols_.add(col);
		
		parse();
	}

	
	// ----------------------------------------------------------------------------

	/** 
	 * Parse the targets and expression data of filename_ into expr_.
	 * Cells that are empty or "NA" (ignoring case and surrounding whitespace) are stored as NaN.
	 * A gene listed on several lines gets one additional value per expression column for each line.
	 * 
	 * @throws RuntimeException if a line does not have exactly numCols_ columns
	 * @throws NumberFormatException if an expression cell is neither missing nor a number
	 */
	private void parse() {
		
		Flynet.println("Reading file: " + filename_);
		
		// NOTE(review): the parser is never explicitly released here; whether FileParser
		// holds a resource that needs closing cannot be told from this file -- confirm
		FileParser parser = new FileParser(filename_);
		parser.skipLines(header_);
		// Line counter used for error messages (1-based, includes the header)
		int l = header_;
		
		while (true) {
			// Read the next line
			String[] line = parser.readLine();
			l++;
			if (line == null)
				break;
			
			// Check that it has the right number of columns
			if (line.length != numCols_)
				throw new RuntimeException("Line " + l + ": expected " + numCols_ + " columns, found " + line.length);
			
			// The expression values of this line
			ArrayList<Double> x = new ArrayList<Double>(exprCols_.size());
			for (int col : exprCols_) {
				// Trim so that padded cells such as " NA" are recognized as missing instead of crashing
				String str = line[col-1].trim();
				if (str.equalsIgnoreCase("NA") || str.isEmpty())
					x.add(Double.NaN);
				else
					x.add(Double.parseDouble(str));
			}
			
			String target = line[targetCol_-1];
			
			// First time we see this gene: create one (empty) list of values per expression column
			ArrayList<ArrayList<Double>> expr = expr_.get(target);
			if (expr == null) {
				expr = new ArrayList<ArrayList<Double>>(x.size());
				for (int i=0; i<x.size(); i++)
					expr.add(new ArrayList<Double>());
				expr_.put(target, expr);
			}
			
			// Add the values
			for (int i=0; i<x.size(); i++)
				expr.get(i).add(x.get(i));
		}
	}

	
	// ----------------------------------------------------------------------------

	/** 
	 * Export the list of genes that are differentially expressed in any of the given columns,
	 * writing one file for each of the thresholds_ 
	 */
	private void writeDifferentiallyExpressedGenes(int[][] cols, String filename) {
		
		for (double threshold : thresholds_)
			writeDifferentiallyExpressedGenes(cols, threshold, filename);
	}

	
	// ----------------------------------------------------------------------------

	/** 
	 * Export the list of genes that are differentially expressed in any of the given columns.
	 * cols[t][k] gives the column of the expression matrix corresponding to the k'th repeat of time point t.
	 * The output file is named filename + "_" + threshold (one decimal) + ".txt", each line is "TF tab target".
	 */
	private void writeDifferentiallyExpressedGenes(int[][] cols, double threshold, String filename) {
		
		// Fix the symbols so that the file name always uses a decimal point ("_0.7.txt"),
		// independently of the default locale of the machine
		DecimalFormat oneDec = new DecimalFormat("0.0", new DecimalFormatSymbols(Locale.US));
		filename = filename + "_" + oneDec.format(threshold) + ".txt";
		Flynet.println("Writing file " + filename);
		FileExport writer = new FileExport(filename);
		
		// Close the writer even if something goes wrong while writing
		try {
			for (String target : expr_.keySet()) {
				if (isDifferentiallyExpressed(expr_.get(target), cols, threshold))
					writer.println(tf_ + "\t" + target);
			}
		} finally {
			writer.close();
		}
	}

	
	// ----------------------------------------------------------------------------

	/** 
	 * Check whether the given expression profile reaches the threshold at any time point.
	 * For every repeat the median over the probes is taken (maxAbs() would be an alternative),
	 * then the median over the repeats of the time point is compared in absolute value with the threshold.
	 * A time point whose median is NaN never counts as differentially expressed.
	 */
	private boolean isDifferentiallyExpressed(ArrayList<ArrayList<Double>> profile, int[][] cols, double threshold) {
		
		// For each time point
		for (int t=0; t<cols.length; t++) {
			ArrayList<Double> repeats = new ArrayList<Double>(cols[t].length);
			
			// For each repeat
			for (int k=0; k<cols[t].length; k++) {
				ArrayList<Double> probes = profile.get(cols[t][k]-1);
				repeats.add(median(probes));
			}
			
			double mu = median(repeats);
			if (Math.abs(mu) >= threshold)
				return true;
		}
		return false;
	}

	
	// ----------------------------------------------------------------------------

	/** Compute the mean of the given vector (NaN if the vector is empty) */
	@SuppressWarnings("unused")
	private double mean(ArrayList<Double> x) {
		
		double sum = 0;
		for (double xi : x)
			sum += xi;
		
		return sum / x.size();
	}
	
	
	// ----------------------------------------------------------------------------

	/** 
	 * Compute the median of the given vector. The vector itself is left untouched.
	 * For an even number of elements the average of the two middle elements is returned.
	 * NaN entries sort after all other values, so the result is NaN only if a middle element is NaN.
	 * Returns NaN if the vector is empty.
	 */
	private double median(ArrayList<Double> x) {
		
		if (x.isEmpty())
			return Double.NaN;
		
		// Sort a copy, the caller's list (e.g., the probe values stored in expr_) must not be reordered
		ArrayList<Double> sorted = new ArrayList<Double>(x);
		Collections.sort(sorted);
		
		int n = sorted.size();
		int mid = n / 2;
		
		if (n % 2 == 1)
			return sorted.get(mid);
		
		double lower = sorted.get(mid-1);
		double upper = sorted.get(mid);
		return (lower + upper) / 2.0;
	}

	
	// ----------------------------------------------------------------------------

	/** Return the max absolute value (0 if the vector is empty, NaN entries are ignored) */
	@SuppressWarnings("unused")
	private double maxAbs(ArrayList<Double> x) {
		
		double max = 0;
		for (double value : x) {
			double xi = Math.abs(value);
			// A NaN never compares greater, so it is skipped
			if (xi > max)
				max = xi;
		}
		
		return max;
	}

}
