#!/usr/bin/Rscript
# 1/24/2017 pre-digest genotype data, run mcld, post-process + collate data.
library(stringr)

# upped window size 2/7/17 for selection LD ests... could probably also rerun other stuff.
# hopefully doesn't crash anything.
# Positional arguments given after the script name: <locus> <window size>.
cli_args = commandArgs(trailingOnly=TRUE)
# Row name of the focal locus; a missing argument leaves this as NA.
locus = cli_args[1]
# Half-width of the window, e.g. 10: *2+1 = num loci to run at once (keep 10 up, 10 down).
window_s = as.numeric(cli_args[2])
# LD threshold for how many results to keep: r = sqrt(.05), i.e. r^2 = .05.
# (Not referenced by the code visible in this part of the file.)
r_thresh = sqrt(.05)
#chr = commandArgs(trailingOnly=TRUE)[1]


# Write the genotypes of the loci in a window around `focal_pos` to the input
# file for mcld, and return the metadata columns for those same loci.
#
# Args:
#   dat:         table with one locus per row (row names identify the loci);
#                columns 1-2 are returned as metadata, columns 3..ncol are
#                written out as genotypes.
#   focal_pos:   row index of the focal locus in `dat`.
#   window_size: number of rows to take on each side of the focal row; the
#                window is clipped at the first and last row of `dat`.
#   outfilename: path of the file to write.
#
# Returns:
#   FALSE (and writes nothing) when the focal locus is the last row of `dat`;
#   otherwise columns 1-2 of the window, in window order.
assemble_genos_for_mcld = function(dat, focal_pos, window_size=window_s, outfilename=paste('mcld.in',locus,sep='.')) {
	stopifnot(length(focal_pos) == 1, !is.na(focal_pos),
		focal_pos >= 1, focal_pos <= nrow(dat))
	stopifnot(length(window_size) == 1, !is.na(window_size), window_size >= 0)

	# sentinel for "focal locus is the very last row"; checked before any
	# slicing so that it cannot be masked by an indexing error
	if (focal_pos == nrow(dat)) {return(FALSE)} # chr1 exactly trips this

	# clip the window at both ends of the table; an unclipped lower bound
	# would hand zero/negative row indices to `[`
	first_row = max(focal_pos - window_size, 1)
	last_row = min(focal_pos + window_size, nrow(dat))
	to_format = dat[first_row:last_row, , drop=FALSE]

	# every locus is emitted twice in a row, so after the transpose below each
	# locus occupies two adjacent columns of the output file
	# NOTE(review): presumably the two-columns-per-locus layout mcld expects -- confirm
	doubled_rows = rep(seq_len(nrow(to_format)), each=2)
	to_write = to_format[doubled_rows, 3:ncol(to_format), drop=FALSE]

	metadat = to_format[,1:2]
	# transposed: one output line per genotype column of `dat`, no row/column labels
	write.table(t(to_write), file=outfilename, quote=FALSE, row.names=FALSE, col.names=FALSE)
	return(metadat)
}

# Run mcld on `datfile` and return its output as a data frame.
#
# Args:
#   datfile: path of the genotype file to hand to ./mcld.exe via -file=.
#   outfile: unused; kept only so existing calls that pass it keep working.
#
# Returns:
#   the whitespace-separated text that mcld prints, minus its first line,
#   parsed by read.table() without a header row.
#
# Nothing is cleaned up here: `datfile` is left on disk.
run_mcld = function(datfile=paste('mcld.in',locus,sep='.'), outfile='mcld.out') {
	# the file name embeds the locus name taken from the command line, so quote
	# it for the shell rather than pasting it in raw
	cmd_string = paste0('./mcld.exe -file=', shQuote(datfile), ' -perm=0 -miss=0')
	# read straight from the program's output; read.table() opens the pipe
	# connection itself and closes it again when it is done
	mcld_out = read.table( pipe( cmd_string ), header=FALSE, skip=1, sep='')
	#system(paste('./mcld.exe -file=',datfile,' -perm=0 -missing=0 > ',outfile,sep=''))
	return(mcld_out)
}

# Turn raw mcld output into one tab-separated line per locus pair on stdout:
#   loc1, loc2, pos1, pos2, dist (pos2 - pos1), str, r
#
# Args:
#   mcld_out: data frame as returned by run_mcld(); column 1 holds the pair
#             as "i/j" (indices into the rows of `metadat`), column 3 is the
#             value printed as r.
#   metadat:  data frame as returned by assemble_genos_for_mcld(); row names
#             are the locus names and `metadat$pos` gives the positions.
#   window:   window half-width; window + 1 is the index the filter compares
#             against.
#   filter:   if TRUE keep only pairs whose second index is > window + 1.
#             Per the original design: take everything for the first window
#             read, otherwise take only pairs whose second member lies past
#             the focal position -- this keeps the new pairs while screening
#             out the dupes.
#
# Returns: the (invisible NULL) result of write.table(); called for its output.
process_mcld_out = function(mcld_out, metadat, window, filter=TRUE) {
	window=window+1
	# assemble_genos_for_mcld() can return FALSE; without this check such a
	# value slips through the shape test below (nrow(FALSE) is NULL)
	stopifnot( is.data.frame(metadat) )
	# `$` is kept on purpose: it also matches a longer column name that starts
	# with "pos". A missing column would otherwise vanish silently in cbind().
	stopifnot( !is.null(metadat$pos) )
	# test that data are the right shape: one mcld row per unordered locus pair
	stopifnot( ((nrow(metadat)*(nrow(metadat) - 1))/2) == nrow(mcld_out) )

	# split "i/j" into a two-column matrix, one row per pair
	locus_pairs = t( matrix(unlist(strsplit(as.character(mcld_out[,1]),split='/')), nrow=2) )
	first_idx = as.numeric(locus_pairs[,1])
	second_idx = as.numeric(locus_pairs[,2])

	# select rows through a logical mask on the index vectors; subsetting the
	# matrix itself collapses to a plain vector when exactly one pair is kept
	if (filter==TRUE) {
		keep = second_idx > window
	} else {
		keep = rep(TRUE, nrow(locus_pairs))
	}
	first_idx = first_idx[keep]
	second_idx = second_idx[keep]
	corrs = mcld_out[keep,3]

	loci1 = rownames(metadat)[first_idx]
	loci2 = rownames(metadat)[second_idx]
	pos1 = metadat$pos[first_idx]
	pos2 = metadat$pos[second_idx]
	dists = pos2 - pos1
	# number of times the literal text 'str' occurs in the two locus names
	# together (base-R equivalent of str_count() for this fixed pattern)
	bothloc = paste0(loci1, loci2)
	strs = lengths(regmatches(bothloc, gregexpr('str', bothloc, fixed=TRUE)))

	# amended to output also pos info
	out = cbind(loci1, loci2, pos1, pos2, dists, strs, corrs)
	write.table(out,quote=FALSE,file="",row.names=FALSE,col.names=FALSE,sep='\t')
}

# load genotype data - becomes object 'snps'
load('str_snp_genos_for_mcld.Rdat')
#load('test.Rdat')
# WILL IT SPEED UP IF I AS.MATRIX() THE DATASET?? SHOULDN'T SCREW ANYTHING UP.

# History: this used to loop over unique(snps$chr) and slide focal_pos along
# each chromosome in steps of window_s, passing filter=FALSE only for the first
# window so that later windows did not repeat pairs. It now handles the single
# locus named on the command line, unfiltered.

# fail early with a readable message instead of an obscure error further down
if (is.na(locus) || is.na(window_s)) {
	stop('usage: Rscript <this script> <locus> <window size>', call.=FALSE)
}
focal_pos = which(rownames(snps)==locus)
if (length(focal_pos) != 1) {
	stop(sprintf("locus '%s' matches %d rows of snps; expected exactly 1",
		locus, length(focal_pos)), call.=FALSE)
}

# header for outfile
# amended to include also pos info
cat('loc1\tloc2\tpos1\tpos2\tdist\tstr\tr\n')

snp_chr = snps
metadat = assemble_genos_for_mcld(snp_chr, focal_pos)
if (identical(metadat, FALSE)) {
	# assemble_genos_for_mcld() returns FALSE, without writing an input file,
	# when the focal locus is the last row; there is nothing to run mcld on
	message('focal locus is the last row of the genotype table; no pairs to report')
} else {
	mcld_out = run_mcld()
	process_mcld_out(mcld_out = mcld_out, metadat = metadat, window_s, filter=FALSE)
}

# save + plot results in a couple different ways.
# will do this in other code.
