####################################################################################################
# Generate log2 ratio and adm3 values from exome sequence read count data
# Author: Tomas William Fitzgerald
# Email: tf2@sanger.ac.uk

# Reads the output of cnv_baf_data.c, uniquified on CNV bait regions

#' Read the read depth information from cnv_baf_data's output
#'
#' Streams the file through \code{awk} (keeping only the first line seen for
#' each distinct value of column 1) and \code{cut} (keeping columns 1-3), then
#' parses the result as a tab-separated table. Requires \code{awk} and
#' \code{cut} to be available on the system.
#'
#' @param file Path to file (tab-separated, no header)
#' @param n Maximum number of (unique) rows to read
#'
#' @return Dataframe with three columns: column 1 as character, columns 2 and 3
#'   as numeric
#' @export
read_unique_depth_bait_data <- function(file, n) {
  # Quote the path so that spaces or shell metacharacters in it cannot break
  # (or be interpreted by) the shell pipeline. path.expand() keeps a leading
  # "~" working, since the shell no longer expands it once quoted.
  quoted_file <- shQuote(path.expand(file))

  # awk '!_[$1]++' prints a line only the first time its column-1 key is seen.
  command <- paste0("awk -F", '"\t"', " '!_[$1]++' ", quoted_file, " | cut -f1-3 ")

  read.table(
    pipe(command),
    header = FALSE,
    check.names = FALSE,
    colClasses = c("character", "numeric", "numeric"),
    nrows = n,
    comment.char = "",
    sep = "\t"
  )
}

#' Calculates the correlation between samples using read counts at bait regions
#'
#' For every reference file, column 2 is read via
#' \code{\link{read_unique_depth_bait_data}} and correlated (Pearson, the
#' \code{cor} default) against column 2 of \code{data}. A pseudo-count of 0.01
#' is added to both vectors before correlating.
#'
#' @param data Output from \code{\link{read_unique_depth_bait_data}}
#' @param rdfiles Vector of reference data files (output of cnv_baf_data)
#' @return Numeric vector with one correlation per entry of \code{rdfiles}
#'   (named by file when \code{rdfiles} is a character vector); an empty
#'   numeric vector when \code{rdfiles} is empty
#' @export
generate_correlation_values <- function(data, rdfiles) {
  # Hoisted out of the per-file closure: these do not depend on the file.
  sample_counts <- data[, 2] + 0.01
  n_baits <- nrow(data)

  # vapply() guarantees a numeric result even for zero reference files,
  # where sapply() would return an empty list.
  vapply(
    rdfiles,
    function(rdfile) {
      ref_counts <- read_unique_depth_bait_data(rdfile, n_baits)[, 2] + 0.01
      cor(sample_counts, ref_counts)
    },
    numeric(1)
  )
}


#' Generate log2 ratio values for a sample
#'
#' Reference files are ranked by their correlation with the sample
#' (\code{\link{generate_correlation_values}}). The best-correlated file is
#' discarded and the per-bait median of column 2 across the remaining files is
#' used as the reference. A pseudo-count of 0.01 is added to all counts.
#'
#' @param data Output from \code{\link{read_unique_depth_bait_data}}
#' @param rdfiles Vector of reference data files (output of cnv_baf_data);
#'   at least two files are needed so that one remains after discarding the
#'   best-correlated one
#' @return Numeric vector of log2(sample / reference median), one per row of
#'   \code{data}
#' @export
calc_log2_ratio <- function(data, rdfiles) {
  cors <- generate_correlation_values(data, rdfiles)
  ranked_files <- rdfiles[order(cors, decreasing = TRUE)]

  # Drop the top-ranked file.
  # NOTE(review): presumably the sample itself is among rdfiles and ranks
  # first with correlation 1 -- confirm with callers.
  # Negative indexing (unlike 2:length(...)) is safe for a single file.
  ref_files <- ranked_files[-1]
  if (length(ref_files) == 0) {
    stop("calc_log2_ratio() needs at least two reference files", call. = FALSE)
  }

  # One column per reference file. Keeping this as a matrix means a single
  # remaining reference still has two dimensions for apply().
  ref_counts <- do.call(
    cbind,
    lapply(ref_files, function(x) {
      read_unique_depth_bait_data(x, n = nrow(data))[, 2]
    })
  ) + 0.01

  ref_median <- apply(ref_counts, 1, median)
  log2((data[, 2] + 0.01) / ref_median)
}

#' Converts log2 ratios and their spread estimates to ADM3 scores
#'
#' The weight of each row is the inverse square of column 5, and the score is
#' column 4 multiplied by that weight and divided by the weight's square root.
#'
#' @param d Table whose 4th column holds the log2 ratios and whose 5th column
#'   holds the spread (MAD) values used for weighting
#' @return \code{d} with two extra columns appended: \code{w} (the weights)
#'   and \code{EWS} (the weighted scores)
#' @export
adm3_score <- function(d) {
  inverse_sq_spread <- 1 / (d[, 5]^2)
  weighted_score <- d[, 4] * inverse_sq_spread / sqrt(inverse_sq_spread)
  data.frame(d, w = inverse_sq_spread, EWS = weighted_score)
}

#' Splits the read count distribution into bins
#'
#' The values in column 2 of \code{x} (plus a 0.01 pseudo-count) are sorted
#' and cut into consecutive chunks of at most \code{max_bin_size} values. The
#' maximum of every chunk except the last becomes a bin boundary, with 0 as
#' the lowest boundary and (overall maximum + 1) as the highest.
#'
#' @param x Table whose 2nd column holds the read counts (coerced to numeric)
#' @param max_bin_size Maximum bin size
#' @return Dataframe with one row per non-empty bin: column
#'   \code{#ReadsInRanges} is the bin's interval label and column
#'   \code{#Breakpoints} is the lower boundary of that bin
#' @export
breakpoints <- function(x, max_bin_size = 1000) {
  counts <- as.numeric(x[, 2]) + 0.01
  if (length(counts) == 0) {
    stop("breakpoints() needs at least one row of read counts", call. = FALSE)
  }

  # Consecutive chunks of the sorted counts, max_bin_size values each.
  chunks <- split(sort(counts), ceiling(seq_along(counts) / max_bin_size))
  chunk_max <- vapply(chunks, max, numeric(1), USE.NAMES = FALSE)

  # Boundaries: 0, every chunk maximum but the last, and one past the overall
  # maximum. unique() collapses boundaries repeated because of tied counts.
  bin_breaks <- unique(c(0, chunk_max[-length(chunk_max)], max(counts) + 1))

  bins <- cut(counts, breaks = bin_breaks)
  occupied_bins <- sort(unique(bins))

  # Level k of the factor is the interval (bin_breaks[k], bin_breaks[k + 1]],
  # so indexing by level keeps each label paired with its own lower boundary
  # even when ties leave the last bin empty.
  bpoints <- data.frame(occupied_bins, bin_breaks[as.integer(occupied_bins)])
  names(bpoints) <- c("#ReadsInRanges", "#Breakpoints")
  bpoints
}

#
#' Generates weights based on mad of log2 ratios within read count range and calculate adm3 scores
#'
#' Rows are binned by read count using the boundaries from
#' \code{\link{breakpoints}}. Within each bin the (unscaled, \code{constant = 1})
#' MAD of the log2 ratios is computed, and every row is then scored with
#' \code{\link{adm3_score}} using its bin's MAD.
#'
#' @param readdata Table whose first four columns are, in order: position,
#'   read count, read depth and log2 ratio (they are relabelled
#'   \code{position}, \code{rc}, \code{rd}, \code{lg2r})
#' @return Dataframe in the original row order with columns \code{position},
#'   \code{rc}, \code{rd}, \code{lg2r}, \code{mads}, \code{weight} and
#'   \code{adm3}
#' @export
calc_adm3_scores <- function(readdata) {
  # Rebuild the bin boundaries from the lower breakpoints plus an upper cap.
  bps <- breakpoints(readdata)
  bin_breaks <- c(bps[, 2], max(readdata[, 2]) + 1)
  bin <- cut(readdata[, 2] + 0.01, breaks = bin_breaks)

  # rowcount remembers the input order, which merge() below does not preserve.
  per_probe <- data.frame(seq_len(nrow(readdata)), readdata[, 1:4], bin)
  names(per_probe) <- c("rowcount", "position", "rc", "rd", "lg2r", "cr")

  # MAD of the log2 ratios within each occupied bin.
  # NOTE(review): a bin whose MAD is 0 yields an infinite weight in
  # adm3_score(); confirm whether a lower floor was intended.
  occupied_bins <- sort(unique(bin))
  bin_mads <- vapply(
    seq_along(occupied_bins),
    function(i) {
      in_bin <- per_probe[, 6] == occupied_bins[i]
      mad(per_probe[in_bin, 5], constant = 1, na.rm = TRUE)
    },
    numeric(1)
  )
  bin_table <- data.frame(mads = bin_mads, cr = occupied_bins)

  # Attach each row's bin MAD, then restore the input order.
  merged <- merge(x = per_probe, y = bin_table, by = "cr")
  merged <- merged[order(merged[, 2]), ]

  # Columns 3:7 after the merge are position, rc, rd, lg2r, mads.
  adm_rep <- adm3_score(merged[, 3:7])
  colnames(adm_rep) <- c("position", "rc", "rd", "lg2r", "mads", "weight", "adm3")
  adm_rep
}

#' Reformats the data.frame with the ADM3 scores to serve as input for the MrMosaic functions
#'
#' The *.mrm file is read as seven columns (position, rc, rd, snp_position,
#' ref_base, baf, GT) and joined to \code{adm3_scores} on \code{position}.
#' The SNP chromosome is the text before the first ":" of \code{snp_position};
#' the SNP coordinate is the text after it, up to the first "-" if present.
#'
#' @param input_file Path to *.mrm file generated by the cnv_baf_data program
#' @param adm3_scores data.frame with output from calc_adm3_scores(); must
#'   contain the columns \code{position}, \code{lg2r} and \code{adm3}
#' @return data.frame ready to serve as input for the Mr.Mosaic calling function,
#'   with columns \code{Name}, \code{Chr}, \code{Position}, \code{ADM3},
#'   \code{Log.R.Ratio}, \code{B.Allele.Freq} and \code{GType}. Rows without a
#'   genotype are removed and the rest sorted by chromosome and position.
#' @export
reformat_for_mrmosaic <- function(input_file, adm3_scores) {
  data <- read.table(input_file)
  colnames(data) <- c("position", "rc", "rd", "snp_position", "ref_base", "baf", "GT")

  # rc/rd already come from the input file. A logical mask is used because
  # x[, -which(...)] would drop every column when neither name is present.
  keep_cols <- !(colnames(adm3_scores) %in% c("rc", "rd"))
  data <- merge(data, adm3_scores[, keep_cols, drop = FALSE], by = "position")

  # Parse each "chr:start[-end]" entry on its own so that a missing or
  # unusual entry cannot shift the values of the other rows.
  snp_fields <- strsplit(as.character(data$snp_position), ":", fixed = TRUE)
  snp_chr <- vapply(snp_fields, function(f) f[1], character(1))
  snp_range <- vapply(snp_fields, function(f) f[2], character(1))
  snp_position <- sub("-.*$", "", snp_range)

  # sprintf() (unlike paste()) yields a zero-length result for zero rows.
  mrm_probe_ids <- sprintf("e.%s.%s", snp_chr, snp_position)

  # NOTE(review): as.numeric() turns non-numeric chromosome names (e.g. "X")
  # into NA with a warning -- confirm this is acceptable downstream.
  mrm_data <- data.frame(
    Name = as.character(mrm_probe_ids),
    Chr = as.numeric(snp_chr),
    Position = as.numeric(snp_position),
    ADM3 = as.numeric(data$adm3),
    Log.R.Ratio = as.numeric(data$lg2r),
    B.Allele.Freq = as.numeric(data$baf),
    GType = as.character(data$GT)
  )

  mrm_data <- mrm_data[!is.na(mrm_data$GType), ]
  mrm_data[order(mrm_data$Chr, mrm_data$Position), ]
}
