# to analyze bayescan2.1 output
library(gplots)
library(rworldmap)
# Fix the RNG seed so that random steps in this script (e.g. jitter()) are
# reproducible from run to run.
set.seed(311)

# Cutoff above which a locus is called "under selection".
# NOTE(review): presumably compared against BayeScan's `prob` column -- confirm.
sel_criterion <- 0.5


# helper functions used by the analysis below
# Return the index of the largest element of `vec`.
#
# vec: numeric vector (e.g. one row of cluster-membership proportions).
# Returns a single integer index (carrying the element's name, if `vec` is
# named). Ties are resolved in favour of the first maximum and NAs are
# ignored, so the result always has length 1 for non-empty, non-all-NA input.
# This matters when the function is used inside apply(): a length that varies
# between rows would make apply() return a list instead of a vector.
give_highest = function(vec) {
  which.max(vec)
}

# Tabulate genotype frequencies within each population.
#
# pop_assigned: vector of group labels, one per individual.
# geno:         genotype calls for the same individuals; it is transposed and
#               converted to a factor before tabulation.
# Returns a table with one row per genotype level and one column per group,
# where every column is scaled to sum to 1.
compute_freqs = function(pop_assigned, geno) {
  allele_calls <- as.factor(t(geno))
  paired <- data.frame(pop = pop_assigned, geno = allele_calls)
  # table() gives pop x geno counts; flip so that groups are the columns
  geno_by_pop <- t(table(paired))
  # margin = 2: normalise within each column (i.e. within each group)
  prop.table(geno_by_pop, 2)
}

# phi_st: placeholder for computing \phi_st, meant for gauging how biased Fst
# is in the high-mu cases.
# Planned inputs: `genos`, a matrix of genotypes, and `pop_assign`, a vector of
# population assignments corresponding to the columns of that matrix.
# ref: Michalakis and Excoffier, Genetics 1996
# NOTE: not implemented yet -- the function ignores its arguments and
# returns NULL.
phi_st = function(genos, pop_assign) {
  NULL
}

# S_b: placeholder for the sum of squared deviations of allele size between
# two populations (`pop1`, `pop2`).
# NOTE: not implemented yet -- the function ignores its arguments and
# returns NULL.
S_b = function(pop1,pop2) {
  NULL
}

# S_w: placeholder for the sum of squared deviations of allele size within a
# single population (`pop`).
# NOTE: not implemented yet -- the function ignores its argument and
# returns NULL.
S_w = function(pop) {
  NULL
}

# ---- Load genotypes, library metadata and STR annotations ----
# genos is indexed below as genos[<STR id>, <strain>]: rows are loci, columns
# are sequenced libraries (renamed to strain names further down).
genos = read.table('~/Dropbox/Ath_STRs/problem_mip_genotypes_081016.txt', header=TRUE)
info = read.csv('~/Dropbox/Ath_STRs/mip_design_troubleshoot/mip_lib_info_081116.csv', header=TRUE, stringsAsFactors = FALSE)
annots = read.csv('~/Dropbox/Ath_STRs/araport_annot/str_annots.csv', header=TRUE)

# Only the first n_libs rows of the info sheet are used.
n_libs = 96
info = info[seq_len(n_libs),]

# Derive library names from the FASTQ file names. fixed = TRUE so that the
# dots in the suffix are matched literally rather than as regex wildcards.
# NOTE(review): '-' -> '.' presumably mirrors the name mangling read.table()
# applied to the genotype column headers -- confirm.
lib_names = gsub('_R1_001.fastq.gz', '', info[,'spikein_file'], fixed = TRUE)
lib_names = gsub('-', '.', lib_names, fixed = TRUE)
rownames(info) = lib_names

# Fail early with a readable message if a genotype column has no metadata row;
# otherwise the lookup below silently yields NA rows (and NA strain names).
unmatched = setdiff(colnames(genos), lib_names)
if (length(unmatched) > 0) {
  stop('genotype columns with no matching library in info: ',
       paste(unmatched, collapse = ', '))
}

# Put the metadata in the same order as the genotype columns, then label the
# genotype columns by strain.
info = info[colnames(genos),]
colnames(genos) = info$Strain

# ---- Population (cluster) membership ----
# final.theta: one row per individual (same order as the genotype columns),
# one column per cluster.
pop_assign = read.table('~/Downloads/mstruct_download/ath_strs_6_666_ps5/final.theta')
rownames(pop_assign) = colnames(genos)
colnames(pop_assign) = seq_len(ncol(pop_assign))

# Hard-assign each individual to the cluster with the highest membership value.
pop_assigned = apply(pop_assign,1,give_highest)

# Region / cluster / strain table, sorted by assigned cluster.
reg = data.frame(region=info$Region.1001genomes.,pop=pop_assigned,name=colnames(genos))[order(pop_assigned),]

# ---- Population-assignment figures (all pages go into one PDF) ----
pdf('~/tmp/str_pop_assignment_032817.pdf')

# Heatmap of the membership matrix. The original call ended with a dangling,
# empty `cexCol = ` argument; dropping it leaves heatmap.2's default in effect.
heatmap.2(as.matrix(pop_assign), trace = 'none', cexRow=.3)

# ---- BayeScan results ----
bayes = read.table('~/Dropbox/Ath_STRs/selection_analysis/mstruc_bayescan_analysis2017/strs_bayescan_011817_fst.txt', header=TRUE)
# Label the BayeScan rows with the identifiers in column 2 of the index file.
rownames(bayes) = read.table('~/Dropbox/Ath_STRs/selection_analysis/mstruc_bayescan_analysis2017/str_indices_for_bayescan.txt')[,2]

# Loci whose `prob` value reaches the selection threshold defined at the top
# of the script (sel_criterion).
bayesel = bayes[bayes$prob >= sel_criterion,]

# Largest q-value among the selected loci. The newline must be a separate
# argument to cat(): inside max() it coerces the q-values to character and the
# "maximum" becomes a string comparison.
cat('false discovery rate of selection at this threshold is', max(bayesel$qval), '\n')

hist(bayes$fst,200,xlab='Fst', main='All STR loci')

# plot affinities of clusters for geographic regions
par(mar=c(7,4,4,2))
barplot(compute_freqs(reg[,1],pop_assigned[as.character(reg$name)]),las=2,col=rainbow(6),main='Populations across regions')
legend(.05,.7,fill=rainbow(6),legend=1:6)

# lat/longs for strains
geodat = read.csv('~/Dropbox/Ath_STRs/selection_analysis/mstruc_bayescan_analysis2017/A.thaliana_nordborg_96_strain_annots_020117_names_changed.csv', header=TRUE)
newmap <- getMap(resolution = "low")
# Jitter the coordinates so strains sampled at the same site do not overplot.
latitude = jitter(geodat$latitude, amount=1)
longitude = jitter(geodat$longitude,amount=1)

# also make a map showing where pop clusters are
par(mar=c(5,4,4,2))
# NOTE(review): ylim = c(40, 160) exceeds the valid latitude range; it looks
# like a hand-tuned zoom in combination with asp = 1 -- confirm.
plot(newmap,xlim = c(min(geodat$longitude),max(geodat$longitude)), ylim=c(40,160),asp=1)
# Colour each strain by its assigned cluster (looked up by strain name).
points(longitude,latitude,
       col = rainbow(max(pop_assigned))[pop_assigned[as.character(geodat$name)]], cex=.5, pch=19)
dev.off()

# ---- Per-locus figures for the loci called under selection ----

# Attach annotation columns to the selected loci. The original indexed an
# object called `annotation`, which is never defined in this script; the
# annotation table read above is `annots`.
# NOTE(review): this assumes the row names of `annots` correspond to the STR
# identifiers used as row names of `bayesel` -- confirm.
bayesel = cbind(bayesel, annots[rownames(bayesel),])

# NOTE(review): par() settings apply to the current graphics device only, so
# this layout does not carry over to the per-locus PDFs opened inside the
# loop; each of those PDFs gets two pages (bar plot, then map).
par(mfrow=c(4,6), mar=c(5,4,4,2))

# Strain names in cluster-sorted order (third column of `reg`); loop-invariant.
strain_order = as.character(reg$name)

for (str in as.character(rownames(bayesel))) {
  # One PDF per locus, written to the current working directory.
  pdf(paste0('str', str, 'fst.pdf'))

  # Genotype calls for this locus, in cluster-sorted strain order.
  locus_genos = genos[str, strain_order]

  # One colour per distinct allele, keyed by the allele's character label.
  num_alleles = length(unique(as.character(locus_genos)))
  cols = rainbow(num_alleles)
  names(cols) = unique(as.character(sort(locus_genos)))

  locus_title = paste(str, ', Fst: ', round(bayesel[str,'fst'],3), '\n', sep='')

  # Page 1: allele frequencies within each population cluster.
  geno_freqs = compute_freqs(pop_assigned[strain_order], locus_genos)
  barplot(
    geno_freqs, col=cols,
    las=2, xlab='population', ylab='allele frequency',
    main=locus_title
  )

  # Page 2: map of strains coloured by their allele at this locus.
  plot(newmap, xlim = c(min(geodat$longitude), max(geodat$longitude)),
       ylim=c(95,160), asp=1, main = locus_title)
  points(longitude, latitude,
    col = cols[as.character(genos[str, as.character(geodat$name)])],
    cex=.5, pch=19)
  dev.off()
}

# Split the selected loci by the sign of BayeScan's alpha.
# NOTE(review): per the BayeScan manual, alpha > 0 suggests diversifying
# selection and alpha < 0 suggests balancing/purifying selection, so the name
# `div_sel` for the alpha < 0 set looks inverted -- confirm intent before use.
pos_sel = bayesel[bayesel$alpha>0,]
div_sel = bayesel[bayesel$alpha<0,]

par(mfrow=c(1,1))

