# 9/13/16 
# changing this script from computing LD to preprocessing and merging data for PLINK analysis.
# need to make PED and MAP files because PLINK is stupid and requires this.

# STR analysis of LD 9/10/16
# (w/r/t SNPs)
library(stringr)

# maximum distance for which to calculate LD measure
max_dist =25000

# people get very precious and specific about their LD calcs so I can't use any of them,
# had to write my own measuring D:
# D = (x11)(x22) - (x12)(x21), where those are all haplotype frequencies for 2 biallelic sites.
# willems 2014 seem to have just given up and binarized STR genotypes (!!)
# so maybe i will do that too- it is in the spirit after all. 
LDmax = function(site1,site2) {
  # compute ordered frequencies
  site1 = as.character(site1)
  site2 = as.character(site2)
  alleles1 = sort(table(na.omit(site1)),decreasing=TRUE) / sum(table(na.omit(site1)))
  alleles2 = sort(table(na.omit(site2)),decreasing=TRUE) / sum(table(na.omit(site2)))
 # print(alleles1)
#  print(alleles2)
  if (length(alleles1) < 2 | length(alleles2) < 2) {
    return(NA)
    }

    # binarize it- major vs. minor alleles
  site1[!(site1 %in% names(alleles1)[1:2])] = names(alleles1)[2]
  site2[!(site2 %in% names(alleles2)[1:2])] = names(alleles2)[2]
  
  # bookkeeping
  tabled = table(na.omit(data.frame(site1,site2))) 
#  print(tabled)
  tabled = (tabled)/sum(tabled) 
  D = (tabled[1,1]*tabled[2,2]) - (tabled[1,2]*tabled[2,1])
  # normalize
 # print(D)
#  print(min(c(alleles1[1]*(1-alleles2[1]), alleles2[1]*(1-alleles1[1]) )))
 # print(max(c(-alleles1[1]*(1-alleles2[1]), -alleles2[1]*(1-alleles1[1]) )))
#  if (D>=0) {
  # THERE IS SOMETHING WRONG WITH THIS DPRIME CALC (COMPARE TO GENETICS PACKAGE)
 #Dprime = abs( D / min(c(alleles1[1]*(1-alleles2[1]), alleles2[1]*(1-alleles1[1]) )) )
 # print(Dprime)
   # } else {Dprime = D / max(c(-alleles1[1]*(1-alleles2[1]), -alleles2[1]*(1-alleles1[1]) ))}
#  return(Dprime)
  #return(c(Dprime,D,min(c(alleles1[1]*(1-alleles2[1]), alleles2[1]*(1-alleles1[1]) )),max(c(-alleles1[1]*(1-alleles2[1]), -alleles2[1]*(1-alleles1[1]) )), alleles1[1],alleles2[1]))
  # just do r^2- simpler and more common
  r2 = D^2 / (alleles1[1]*alleles1[2]*alleles2[1]*alleles2[2])
 # return(list('r2'=r2, 'Dprime'=Dprime, 'D'=D))
 return(r2)
   }

# because the genetics package is very fragile and precious and 
# requires its own weird format and all my samples are homozygous
genetics_LD_wrap =  function(geno1,geno2,snps) {
  geno1 = as.character(snps[geno1,3:ncol(snps)])
  geno2 = as.character(snps[geno2,3:ncol(snps)])
  g1 = genotype(a1=geno1,a2=geno1)
  g2 = genotype(a1=geno2,a2=geno2)
  r2 = LD(g1,g2)#$`R^2`
  return(r2)
  }

genos = read.table('~/Dropbox/Ath_STRs/problem_mip_genotypes_081016.txt',header=T,stringsAsFactors = FALSE)
# all this just to map strain names!!
info = read.csv('~/Dropbox/Ath_STRs/mip_lib_info_081116.csv',header=T,stringsAsFactors = FALSE)
lib_names = gsub('_R1_001.fastq.gz','',info[1:96,'spikein_file'])
lib_names = gsub('-','.',lib_names)
info = info[1:96,]
rownames(info) = lib_names
info = info[colnames(genos),]
#colnames(genos) = gsub('-','',info$Strain)
colnames(genos) = str_to_upper(info$Strain)

# for coordinates
annotation = read.table('~/Dropbox/Ath_STRs/str_annots_picked.txt',header=T,sep='\t',stringsAsFactors = FALSE)


# will have to thin this to just the strains i have str data for...
snps = read.csv('~/call_method_75/call_method_75_TAIR9.csv',header=T,stringsAsFactors = FALSE)
snp_meta = read.csv('~/call_method_75/call_method_75_info_ascii.csv',header=T,stringsAsFactors = FALSE)
rownames(snp_meta) = snp_meta$ecotype_id
colnames(snps)[3:ncol(snps)] = str_to_upper(gsub(' ','',snp_meta[unlist(snps[1,3:ncol(snps)]),'nativename']))

# fix mapping errors case-by-case
colnames(snps)[colnames(snps)=='KNO-18'] = 'KNOX-18'
colnames(snps)[colnames(snps)=='KNO-10'] = 'KNOX-10'
colnames(snps)[colnames(snps)=='SHAHDARA'] = 'SHA'
colnames(snps)[colnames(snps)=='COL-0'] = 'COL'
colnames(genos)[colnames(genos)=='AN-0'] = 'AN-1'

snps = cbind(snps[,1:2], snps[,colnames(genos)])
rownames(snps) = apply( snps, 1, function(vec){return(paste('snp',vec[1],vec[2],sep='.'))} )
rownames(genos) = sapply( rownames(genos),  function(str){return(paste('str',str,sep='.'))})

#######
# NOW DOING THIS ALL EXTERNALLY
#######
# # first, snp LD measures 
# #snp_lds = matrix(rep(NA,4e8),2)  # too slow to do lookups well in loop
snp_lds = c()

# added 12/8/16 to control gwas
control_gwas_str = genos[c(
  'str.1665','str.2479','str.3692','str.3950','str.6864','str.37195','str.37359','str.43970','str.65213',
  'str.65291','str.83070','str.83235','str.83239','str.86275','str.86626'),]
control_gwas_snps = c()
control_gwas_snps['str.37195'] = c('snp.2.9598635')
control_gwas_snps['str.65291'] = c('snp.4.454542')
control_gwas_snps['str.1665'] = c('snp.1.2778963')
control_gwas_snps['str.2479'] = c('snp.1.4143163')
control_gwas_snps['str.3692'] = c('snp.1.6371576')
control_gwas_snps['str.3950'] = c('snp.1.6369772')
control_gwas_snps['str.6864'] = c('snp.1.10458712')
control_gwas_snps['str.37359'] = c('snp.2.9581605')
control_gwas_snps['str.43970'] = c('snp.2.18439471')
control_gwas_snps['str.65213'] = c('snp.4.500090')
control_gwas_snps['str.83070'] = c('snp.5.2002340')
control_gwas_snps['str.83235'] = c('snp.5.2095951')
control_gwas_snps['str.83239'] = c('snp.5.2339456')
control_gwas_snps['str.86275'] = c('snp.5.6690646')
control_gwas_snps['str.86626'] = c('snp.5.7382860')

control_phenos = c()
control_phenos['str.37195'] = c('X2_LDV')
control_phenos['str.65291'] = c('X1_LD')
control_phenos['str.1665'] = c('X6_FT16')
control_phenos['str.2479'] = c('X32_avrPphB')
control_phenos['str.3692'] = c('X1_LD')
control_phenos['str.3950'] = c('X6_FT16')
control_phenos['str.6864'] = c('X45_8W.GH.FT')
control_phenos['str.37359'] = c('X6_FT16')
control_phenos['str.43970'] = c('X2_LDV')
control_phenos['str.65213'] = c('X46_8W.GH.LN')
control_phenos['str.83070'] = c('X46_8W.GH.LN')
control_phenos['str.83235'] = c('X6_FT16')
control_phenos['str.83239'] = c('X17_Mg25')
control_phenos['str.86275'] = c('X46_8W.GH.LN')
control_phenos['str.86626'] = c('X57_FT.Field')

control_snp_phenos = cbind(control_phenos,control_gwas_snps,snps[control_gwas_snps,colnames(genos)])

write.table(control_snp_phenos,'control_snp_phenos_strwa_control_012417.txt',quote=FALSE)

#control_gwas_snp_svp = snps[66100:66550,colnames(genos)]
#control_gwas_snp_cry1_etc3 = snps[124700:125300,colnames(genos)]
#control_gwas_snp_cry1_etc3 = rbind(control_gwas_snp_cry1_etc3, snps[124700:125300,colnames(genos)])
#control_gwas = t(rbind(control_gwas_str,control_gwas_snp_svp,control_gwas_snp_cry1_etc3))

for_mcld = c()
for(i in colnames(control_gwas)) {
  for_mcld = rbind(for_mcld,control_gwas[,i],control_gwas[,i])
}

print('analyzing position (chr bp):')
#count = 0
for ( site in 2: nrow(snps) ) {
  cat(unlist(snps[site,1:2]),'\n')
  focal_pos = as.numeric(snps[site,2])
  poses = as.numeric(snps[2:nrow(snps),2]) - focal_pos
  chr = snps[site,1]
  to_comp = which(snps[2:nrow(snps),1]==chr & (poses < max_dist) & (poses >= 50))
  site1 = snps[site,3:ncol(snps)]

  lds = sapply(to_comp, genetics_LD_wrap,snps=snps, geno1=focal_pos)
  snp_lds = rbind(snp_lds, cbind(poses[to_comp],lds) )
  #  for (pos in to_comp) {
  #    if (poses[pos]<50){next}
   #   print(poses[pos])
   #   print(LD(snps[site,3:ncol(snps)], snps[pos,3:ncol(snps)]))
 # count = count+1
  #snp_lds[count,] = c(poses[pos], LD(snps[site,3:ncol(snps)], snps[pos,3:ncol(snps)]))
  #site2 = snps[pos,3:ncol(snps)]
#  snp_lds = rbind(snp_lds,c(poses[pos], LDmax(snps[site,3:ncol(snps)], snps[pos,3:ncol(snps)])))
  
 # }
}

# NAs lead to weird spikiness in lowess fit for some reason
snp_lds = na.omit(snp_lds)
plot(lowess(snp_lds[,1],snp_lds[,2],f=.05),pch='.',col='black',lwd=2,type='n',xlab='distance (bp)',ylab='D\'')
#plot(loess(snp_lds[,2]~snp_lds[,1],span=.2),pch='.',col='black',lwd=2,xlab='distance (bp)',ylab='D\'')

# make .fam file first (also used in .ped, based on being idiots)

