#!/usr/bin/Rscript
# to put all the data blobs together and plot the results

#r2_thresh = as.numeric(commandArgs(trailingOnly=TRUE)[1]) # all r2 values below this will be thrown out 
#r_thresh = sqrt(r2_thresh) # actually use this for thresholding, avoid a bunch of unnecessary squaring.

# Per-STR Fst table. Its rows are re-labelled with the second column of the
# STR index file so that individual STRs can be looked up by row name.
str_fsts <- read.table('strs_bayescan_011817_fst.txt', header = TRUE)
rownames(str_fsts) <- read.table('str_indices_for_bayescan.txt')[, 2]

if (file.exists('all_ld_ests_020217.Rdat')) {
	# Cached copy. NOTE(review): presumably defines `ld_ests` -- confirm the
	# contents of the .Rdat file, nothing here checks it.
	load('all_ld_ests_020217.Rdat')
} else {
	ld_files <- dir(pattern = 'mcld_processed_chr')
	if (length(ld_files) == 0) {
		# Fail here with a clear message rather than much later on a NULL table.
		stop('no mcld_processed_chr files found and no cached all_ld_ests_020217.Rdat')
	}
	# Read every file first and bind once; calling rbind() inside the loop
	# re-copies the accumulated table on every iteration.
	ld_ests <- do.call(rbind, lapply(ld_files, function(ld_file) {
		print(ld_file)
		read.table(ld_file, header = TRUE)
	}))
}

ld_ests <- ld_ests[ld_ests$str == 1, ] # only look at strs
ld_ests <- ld_ests[ld_ests$dist < 50000, ] # only look at relatively close strs (makes lowess more interp)

# Combine one row of the LD table with the Fst record of the STR it mentions.
#
# Args:
#   mcld_line: one row of the LD table as a (named) vector; exactly one of its
#              entries must contain the text 'str'.
#   fsts:      table whose row names are STR ids (the matched entry with its
#              'str.' prefix removed).
#
# Returns:
#   The stripped STR id, followed by the fields of the matching row of `fsts`,
#   followed by every entry of `mcld_line`, combined with c(). The id keeps
#   the name of the entry it was taken from.
#
# Stops with 'not one str in line' unless exactly one entry matches.
extract_mcld_info <- function(mcld_line, fsts) {
	str_entry <- mcld_line[grep('str', mcld_line)]
	if (length(str_entry) != 1) {
		stop('not one str in line')
	}
	str_id <- gsub('str\\.', '', str_entry)
	c(str_id, fsts[str_id, ], mcld_line)
}

# Repeatedly subsample loci and fit a lowess curve of r against distance.
#
# This is subsampling rather than a classical bootstrap: each replicate draws
# `bootsize` distinct loci (without replacement) and keeps every row of
# `table` that belongs to one of them.
#
# Args:
#   table:    matrix or data frame with 'loc2', 'r' and 'dist' columns.
#   bootsize: number of distinct loci drawn per replicate. Must not exceed the
#             number of unique loci, because sampling is without replacement.
#   nboot:    number of replicates.
#
# Returns:
#   A list of length `nboot`; each element is the value of lowess() for one
#   replicate.
lowess_sample <- function(table, bootsize=20, nboot=100) {
	loci <- unique(table[, 'loc2'])
	low_boots <- vector('list', nboot)
	# seq_len() so that nboot = 0 gives an empty list; 1:nboot would iterate
	# over c(1, 0).
	for (i in seq_len(nboot)) {
		# Sample positions, not values: sample(x, ...) treats a single numeric
		# x as 1:x and would draw loci that are not in the table. For more
		# than one locus this produces exactly the same draws as sample(loci).
		picked <- sample.int(length(loci), size = bootsize, replace = FALSE)
		booted <- unlist(loci[picked])
		# drop = FALSE keeps a single matching row two-dimensional so the
		# column lookups below still work.
		dat <- table[table[, 'loc2'] %in% booted, , drop = FALSE]
		low_boots[[i]] <- lowess(as.numeric(dat[, 'r']) ~ as.numeric(dat[, 'dist']))
	}
	low_boots
}


# Run extract_mcld_info() over every row of the LD table and stack the
# per-row results into a single table, one row per LD estimate.
str_ld_info <- do.call(
	rbind,
	apply(ld_ests, MARGIN = 1, FUN = extract_mcld_info, fsts = str_fsts)
)

# Keep a plain-text copy of the combined table.
write.table(str_ld_info, file = 'str_ld_fst_info_020717.txt', quote = FALSE)


# These STRs have theta higher than the 20 highest-Fst STRs; they are left out
# of the low-theta comparison set to correct for mutation rate as a potential
# confounder.
high_theta_strs <- c(
	"100995","102151","102277","102921","103271","104479","10688","15383","16243","17293",
	"17887","17970","1830","18639","18","21531","21956","23529","23825","24040",
	"24252","24954","25087","25271","25619","25739","25806","26127","26216","299",
	"3058","3077","3198","33633","36439","3664","38510","38753","40272","40457",
	"40529","41039","414","41598","42022","42356","43472","43880","44076","441",
	"44218","44858","45525","45540","45871","46044","46956","47261","47587","47689",
	"48996","49050","49282","49612","50053","52594","52814","52842","5442","55219",
	"5791","59631","60248","61022","63034","63827","63885","64287","6431","64597",
	"64611","65677","6782","7150","71","72657","7358","73595","73788","74375",
	"74535","75717","75908","76533","76968","77933","78534","79241","79782","80152",
	"80386","80397","8067","81231","81421","82777","82824","82886","82921","82959",
	"83070","83714","84092","84219","85380","87126","87214","8749","88258","88448",
	"8893","92655","95369","96820","96","97365","97464"
)

# STRs with prob > .25 and a positive alpha in the Fst table.
pos_strs <- rownames(str_fsts[str_fsts$prob > .25 & str_fsts$alpha > 0, ])

# Split the LD records by whether their 'loc2' entry is one of those STRs.
# drop = FALSE keeps each subset two-dimensional even when only a single row
# matches, so the later [, 'r'] / [, 'dist'] / [, 'loc2'] lookups keep working.
is_pos <- str_ld_info[, 'loc2'] %in% pos_strs
pos_lds <- str_ld_info[is_pos, , drop = FALSE]

other_lds <- str_ld_info[!is_pos, , drop = FALSE]
lowtheta_lds <- other_lds[!(other_lds[, 'loc2'] %in% high_theta_strs), , drop = FALSE]

# loess() was tried first but is too slow / resource-intensive and chokes on
# this data, so the curves are fitted with lowess() instead. The abandoned
# calls are kept for reference:
#pos_loess = predict(loess(pos_lds[,'r'] ~ pos_lds[,'dist']), se=T)
#other_loess = predict(loess(other_lds[,'r'] ~ other_lds[,'dist']), se=T)
#lowtheta_loess = predict(loess(lowtheta_strs[,'r'] ~ lowtheta_lds[,'dist']), se=T)

# Lowess fit of r against distance for one set of LD records.
fit_r_by_dist <- function(lds) {
	lowess(lds[, 'r'] ~ lds[, 'dist'])
}

pos_lowess <- fit_r_by_dist(pos_lds)
other_lowess <- fit_r_by_dist(other_lds)
lowtheta_lowess <- fit_r_by_dist(lowtheta_lds)

# Subsampled fits of the low-theta set, drawn later as thin background lines.
lowess_boots <- lowess_sample(lowtheta_lds, nboot = 100)

# Draw all fitted curves into one PDF:
#   thick default-colour line = all STRs outside the positive set
#   thin lines                = one per subsampled low-theta fit
#   red                       = positive STRs
#   blue                      = low-theta STRs
pdf('str_selxn_fit_020717.pdf')
plot(
	other_lowess,
	type = 'l',
	lwd = 2,
	xlab = 'Distance (bp)',
	ylab = 'R',
	xlim = c(0, 40000),
	ylim = c(0, 0.4)
)
#lines(pos_lowess,lwd=2,col='red')
#lines(lowtheta_lowess,lwd=2,col='blue')

# One thin line per subsample replicate -- hopefully not too busy.
for (boot in lowess_boots) {
	lines(boot, lwd = 0.5)
}

# The main curves go on last so they sit on top of the replicate lines.
lines(pos_lowess, lwd = 2, col = 'red')
lines(lowtheta_lowess, lwd = 2, col = 'blue')

# Abandoned loess-based version with confidence bands, kept for reference:
#plot(other_lds$dist, other_loess$fit, type='l', lwd=2, xlab='Distance (bp)', ylab='R', xlim=c(0,40000), ylim=c(0,.4))
#lines(other_lds$dist, other_loess$fit-qt(.975,other_loess$df)*other_loess$se, lty=2)
#lines(other_lds$dist, other_loess$fit+qt(.975,other_loess$df)*other_loess$se, lty=2)

#lines(pos_lds$dist, pos_loess$fit, type='l', lwd=2, col='red')
#lines(pos_lds$dist, pos_loess$fit+qt(.975,pos_loess$df)*pos_loess$se, lty=2, col='red')
#lines(pos_lds$dist, pos_loess$fit-qt(.975,pos_loess$df)*pos_loess$se, lty=2, col='red')

#lines(lowtheta_lds$dist, lowtheta_loess$fit, type='l', lwd=2, col='blue') 
#lines(lowtheta_lds$dist, lowtheta_loess$fit-qt(.975,lowtheta_loess$df)*lowtheta_loess$se, lty=2, col='blue')
#lines(lowtheta_lds$dist, lowtheta_loess$fit+qt(.975,lowtheta_loess$df)*lowtheta_loess$se, lty=2, col='blue')
dev.off()
