#!/usr/bin/Rscript
# 1/24/2017
# preprocesses SNP and STR genotypes for LD analysis.


library(stringr)

# Count the missing (NA) entries in each row of a matrix or data frame.
#
# matrix: a matrix or data frame (anything is.na() turns into a 2-d logical
#         array).
# Returns an integer vector with one count per row, named by rownames(matrix)
# when row names exist (unnamed otherwise).
countnas <- function(matrix) {
  # rowSums over the logical NA mask counts every row in one vectorised pass;
  # unlike looping over rownames() it also works when there are no row names.
  nanum <- rowSums(is.na(matrix))
  # counts are whole numbers; keep them integer as length(which(...)) would be
  storage.mode(nanum) <- 'integer'
  names(nanum) <- rownames(matrix)
  nanum
}

# STR genotype calls; after the renaming below, columns are strains.
genos <- read.table('problem_mip_genotypes_081016.txt', header = TRUE, stringsAsFactors = FALSE)
# all this just to map strain names!!
# Library metadata; only the first 96 rows are used.
info <- read.csv('mip_design_troubleshoot/mip_lib_info_081116.csv', header = TRUE, stringsAsFactors = FALSE)
info <- info[1:96, ]
# Derive library names from the fastq file names. fixed = TRUE so the dots in
# the suffix are matched literally instead of as regex wildcards.
lib_names <- gsub('_R1_001.fastq.gz', '', info[, 'spikein_file'], fixed = TRUE)
# '-' -> '.', presumably to mirror what read.table's check.names did to the
# genotype column names -- TODO confirm
lib_names <- gsub('-', '.', lib_names, fixed = TRUE)
rownames(info) <- lib_names
# Reorder the metadata to the genotype column order.
# NOTE: data-frame row indexing by name partially matches row names.
info <- info[colnames(genos), ]
colnames(genos) <- str_to_upper(info$Strain)

# for coordinates
annotation <- read.table('araport_annot/Ath_STRs_full_annotations_111616.tsv', header = TRUE, sep = '\t', stringsAsFactors = FALSE)

# to thin this to just the strains i have str data for...
# The SNP data are not provided here because they are big; please obtain them as indicated in the readme.
snps <- read.csv('call_method_75_TAIR9.csv', header = TRUE, stringsAsFactors = FALSE)
snp_meta <- read.csv('call_method_75_info_ascii.csv', header = TRUE, stringsAsFactors = FALSE)
rownames(snp_meta) <- snp_meta$ecotype_id
# The first row of the SNP table (columns 3 onward) is used as the per-column
# ecotype id. Look the ids up with an exact match on their character form:
# indexing the data frame by row name would partially match, and would index
# by position if the ids ever came through as numbers.
ecotype_ids <- as.character(unlist(snps[1, 3:ncol(snps)]))
meta_rows <- match(ecotype_ids, rownames(snp_meta))
colnames(snps)[3:ncol(snps)] <- str_to_upper(gsub(' ', '', snp_meta[meta_rows, 'nativename']))

# Strain names that differ between the SNP and STR tables are reconciled
# here, one at a time.
colnames(snps) <- replace(colnames(snps), colnames(snps) == 'KNO-18', 'KNOX-18')
colnames(snps) <- replace(colnames(snps), colnames(snps) == 'KNO-10', 'KNOX-10')
colnames(snps) <- replace(colnames(snps), colnames(snps) == 'SHAHDARA', 'SHA')
colnames(snps) <- replace(colnames(snps), colnames(snps) == 'COL-0', 'COL')
# this one is corrected on the STR side instead
colnames(genos) <- replace(colnames(genos), colnames(genos) == 'AN-0', 'AN-1')

# Keep the two coordinate columns plus only the strains present in genos, and
# drop the first row (the row with weird ids). Negative indexing is used
# because 2:nrow(snps) would count backwards (2, 1) if only one row existed.
snps <- cbind(snps[, 1:2], snps[, colnames(genos), drop = FALSE])[-1, ]
# Build the row names directly from the two columns. apply() would first
# coerce the data frame to a character matrix, which format()s numeric columns
# to a common width and so can leave padding spaces inside the names.
rownames(snps) <- paste('snp', snps[, 1], snps[, 2], sep = '.')
colnames(snps)[c(1, 2)] <- c('chr', 'position')

# Drop STR loci with 70 or more missing calls.
nacounts <- countnas(genos)
genos <- genos[nacounts < 70, ]

# Look up coordinates for the remaining loci by row name; 'Chr<digit>' is
# reduced to the bare digit.
str_chr <- gsub(pattern = 'Chr([0-9])', '\\1', annotation[rownames(genos), 'chr'])
str_pos <- annotation[rownames(genos), 'start']
# Build the frame explicitly so the chr column stays character (cbind() would
# make it a factor on R < 4.0) and the strain column names are left untouched.
genos <- data.frame(chr = str_chr, position = str_pos, genos, check.names = FALSE, stringsAsFactors = FALSE)
# prefix locus names so they stay distinguishable from the 'snp.' rows
rownames(genos) <- paste('str', rownames(genos), sep = '.')
colnames(genos)[c(1, 2)] <- c('chr', 'position')

# Stack the STR rows underneath the SNP rows.
snps <- rbind(snps, genos)

# Coordinates must be numeric for the sort; anything that does not parse as a
# number becomes NA here.
snps[, 1:2] <- lapply(snps[, 1:2], as.numeric)

# Order by the first column, then by the second within it.
snps <- snps[order(snps[[1]], snps[[2]]), ]
# Every remaining NA in the table is written out as 0.
snps[is.na(snps)] <- 0

save(snps, file = 'str_snp_genos_for_mcld.Rdat')


