# Check the LD situation on one chromosome: see whether there may be
# inversions overlapping my STRs.
# chr_to_check selects the chromosome: it is compared against snps$chr to
# subset the SNP table and is used in the plot axis label. The highlight code
# at the bottom of the script only has branches for chromosomes 1 and 2.
chr_to_check = 1
#chr_to_check = 2

library(stringr)

# Count the missing values in each row of a matrix (or data frame).
#
# Args:
#   matrix: a two-dimensional object (matrix or data frame).
# Returns:
#   An integer vector with one NA count per row, named by rownames(matrix)
#   when row names are present and unnamed otherwise.
countnas = function(matrix) {
  # is.na() gives a logical matrix with the same row names, so rowSums()
  # counts the NAs of every row in one pass. Working by row position rather
  # than by row name means inputs without row names are handled and rows
  # sharing a name are each counted separately.
  nanum = rowSums(is.na(matrix))
  # rowSums() returns doubles; keep the counts as integers (names are kept).
  storage.mode(nanum) = 'integer'
  return(nanum)
}


# Load the MIP genotype table and the SNP panel. Only the SNPs are analysed
# below; the genotype table is used for its set of strain columns.
genos = read.table('problem_mip_genotypes_081016.txt', header = TRUE, stringsAsFactors = FALSE)

# The library info sheet is needed only to translate library column names in
# genos into strain names. Only its first 96 rows are used.
info = read.csv('mip_design_troubleshoot/mip_lib_info_081116.csv', header = TRUE, stringsAsFactors = FALSE)
info = info[1:96, ]
# Strip the fastq suffix and replace '-' with '.' so the library names can be
# used to look up the column names of genos.
lib_names = gsub('-', '.', gsub('_R1_001.fastq.gz', '', info[, 'spikein_file']))
rownames(info) = lib_names
# Reorder the info rows to follow the genos columns, then relabel genos by
# upper-cased strain name.
info = info[colnames(genos), ]
#colnames(genos) = gsub('-','',info$Strain)
colnames(genos) = str_to_upper(info$Strain)

# STR annotations, for coordinates.
annotation = read.table('~/Dropbox/Ath_STRs/araport_annot/Ath_STRs_full_annotations_031317.tsv', header = TRUE, sep = '\t', stringsAsFactors = FALSE)

# SNP calls plus accession metadata. The SNP table is thinned further down to
# the strains that have STR data.
snps = read.csv('~/call_method_75/call_method_75_TAIR9.csv', header = TRUE, stringsAsFactors = FALSE)
snp_meta = read.csv('~/call_method_75/call_method_75_info_ascii.csv', header = TRUE, stringsAsFactors = FALSE)
rownames(snp_meta) = snp_meta$ecotype_id
# The first data row of snps holds the ecotype ids of the strain columns; look
# each one up in snp_meta and use the space-free, upper-cased native name as
# the column name.
colnames(snps)[3:ncol(snps)] = str_to_upper(
  gsub(' ', '', snp_meta[unlist(snps[1, 3:ncol(snps)]), 'nativename'])
)

# Fix strain-name mismatches between the two data sets case by case.
names(snps)[names(snps) == 'KNO-18'] = 'KNOX-18'
names(snps)[names(snps) == 'KNO-10'] = 'KNOX-10'
names(snps)[names(snps) == 'SHAHDARA'] = 'SHA'
names(snps)[names(snps) == 'COL-0'] = 'COL'
names(genos)[names(genos) == 'AN-0'] = 'AN-1'

# Keep the two coordinate columns plus the strains present in genos, and drop
# the first row (the one holding the ecotype ids).
snps = cbind(snps[, 1:2], snps[, colnames(genos)])
snps = snps[2:nrow(snps), ]
# Row names are built as snp.<first column>.<second column>.
rownames(snps) = apply(snps, 1, function(vec) paste('snp', vec[1], vec[2], sep = '.'))
colnames(snps)[1:2] = c('chr', 'position')



# Recode the allele calls at one SNP as 0/1.
#
# Args:
#   locus_vec: vector of allele calls for one locus (one entry per strain),
#     optionally named.
# Returns:
#   A plain numeric vector of the same length and with the same names as
#   locus_vec: 0 for the first allele in sorted order, 1 for the second, and
#   NA for missing calls and for any further alleles.
snp_to_bin = function(locus_vec) {
  # factor() orders the observed alleles the same way table() does and leaves
  # NA out of the levels.
  alleles = levels(factor(locus_vec))
  # Match against the first two alleles only: a third or later allele is
  # unmatched and becomes NA rather than carrying an arbitrary number into
  # what is meant to be a binary coding.
  out_vec = match(as.character(locus_vec), head(alleles, 2)) - 1
  names(out_vec) = names(locus_vec)
  return(out_vec)
}

# Recode the SNPs on the selected chromosome as 0/1, run a PCA on them, and
# plot the first six PC scores against chromosome position.
# Relies on snps (columns chr, position, then one column per strain),
# chr_to_check and snp_to_bin() having been set up earlier in the script.

# Transpose so that every column is one SNP; the rows are then chr, position
# and one row per strain.
snps = t(snps[snps$chr==chr_to_check,])

# Strain rows are everything after the chr and position rows.
strain_rows = seq_len(nrow(snps))[-c(1, 2)]

# Recode every SNP column. Starting this loop at column 3 would leave the
# first two SNPs as allele letters, which turn into NA in the numeric
# conversion below and are then dropped by na.omit().
for (i in seq_len(ncol(snps))) {
  snps[strain_rows,i] = snp_to_bin(snps[strain_rows,i])
}

# Back to one row per SNP, stored as numbers.
snps = t(snps)
class(snps) = 'numeric'

# Drop SNPs with any missing value, then run the PCA on the strain columns.
snps = na.omit(snps)
strain_cols = seq_len(ncol(snps))[-c(1, 2)]
snp_pca = prcomp(snps[,strain_cols])

par(mfrow=c(3,2))

# One panel per PC. The highlighted windows are hand-picked row indices into
# the NA-filtered SNP matrix; with all SNP columns recoded, up to two more
# SNPs are kept at the start of the chromosome, so each window sits up to two
# rows earlier than under a loop that skipped the first two SNPs.
for (i in 1:6) {
  plot(snps[,'position'],snp_pca$x[,i],pch='.', ylab = paste('PC',i,sep=''), xlab=paste('Chr', chr_to_check, 'position'))
  if (chr_to_check==2) {
    # kerry thing
    #abline(v=1527905,col='red',lwd=2)  # marks breakpoint
    points(snps[2400:2600,'position'],snp_pca$x[2400:2600,i], col='red',pch=19,cex=.5)
  } else if (chr_to_check==1) {
    # 1665
    points(snps[4300:4800,'position'],snp_pca$x[4300:4800,i], col='red',pch=19,cex=.5)
    # 4165
    points(snps[11450:11700,'position'],snp_pca$x[11450:11700,i], col='green',pch=19,cex=.5)
    # 3692
    points(snps[10500:10700,'position'],snp_pca$x[10500:10700,i], col='blue',pch=19,cex=.5)
    # 2479 (fake)
    #points(snps[6900:7100,'position'],snp_pca$x[6900:7100,i], col='pink',pch=19,cex=.5)

    # kerry's weird region
    #points(snps[47400:47600,'position'],snp_pca$x[47400:47600,i], col='blue',pch=19,cex=.5)
  }
}