# 1/24/17 and 3/16/17
# Get SNP data into a shape that I can use to compare with / control for
# STR-phenotype associations (cannibalized from the LD preprocessing script).

# (with respect to SNPs)
library(stringr)

# STR genotype table. Its column names are sequencing-library ids until they
# are swapped for strain names at the end of this section.
genos <- read.table(
  '~/Dropbox/Ath_STRs/problem_mip_genotypes_081016.txt',
  header = TRUE,
  stringsAsFactors = FALSE
)

# Library metadata -- read only so that library ids can be mapped to strains.
info <- read.csv(
  '~/Dropbox/Ath_STRs/mip_lib_info_081116.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)

# Library id = fastq file name without its suffix, with '-' turned into '.'
# (presumably to match how read.table rewrote the genotype header -- confirm).
# Only the first 96 metadata rows are used.
lib_names <- gsub(
  '-', '.',
  gsub('_R1_001.fastq.gz', '', info[1:96, 'spikein_file'])
)
info <- info[1:96, ]
rownames(info) <- lib_names

# Reorder the metadata rows to follow the genotype columns, so the strain
# names can be assigned positionally.
info <- info[colnames(genos), ]
#colnames(genos) = gsub('-','',info$Strain)
colnames(genos) <- str_to_upper(info$Strain)

# STR annotation table (kept around for coordinates).
annotation <- read.table(
  '~/Dropbox/Ath_STRs/str_annots_picked.txt',
  header = TRUE,
  sep = '\t',
  stringsAsFactors = FALSE
)

# SNP table. At this point it still holds every strain; it gets thinned to the
# strains with STR data further down.
snps <- read.csv(
  '~/call_method_75/call_method_75_TAIR9.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)

# Strain metadata for the SNP table, addressable by ecotype id.
snp_meta <- read.csv(
  '~/call_method_75/call_method_75_info_ascii.csv',
  header = TRUE,
  stringsAsFactors = FALSE
)
rownames(snp_meta) <- snp_meta$ecotype_id

# Columns 3.. of the SNP table are renamed to upper-cased, space-free native
# names. The first row of snps supplies the lookup keys into snp_meta
# (presumably ecotype ids stored as text, so the lookup is by row name rather
# than by position -- TODO confirm).
strain_cols <- 3:ncol(snps)
ecotype_ids <- unlist(snps[1, strain_cols])
native_names <- snp_meta[ecotype_ids, 'nativename']
colnames(snps)[strain_cols] <- str_to_upper(gsub(' ', '', native_names))

# Strain names that differ between the SNP and STR tables, patched one by one.
# SNP-side renames are applied in the order listed (old name -> new name).
snp_name_fixes <- c(
  'KNO-18'   = 'KNOX-18',
  'KNO-10'   = 'KNOX-10',
  'SHAHDARA' = 'SHA',
  'COL-0'    = 'COL'
)
for (old_name in names(snp_name_fixes)) {
  colnames(snps)[colnames(snps) == old_name] <- snp_name_fixes[[old_name]]
}

# The single STR-side rename.
colnames(genos)[colnames(genos) == 'AN-0'] <- 'AN-1'

# Thin the SNP table to its two leading columns plus one genotype column per
# column of genos (same order). Fail with the offending names if any strain
# has no SNP column, instead of the generic "undefined columns selected".
missing_strains <- setdiff(colnames(genos), colnames(snps))
if (length(missing_strains) > 0) {
  stop(
    'strains in genos without a matching SNP column: ',
    paste(missing_strains, collapse = ', '),
    call. = FALSE
  )
}
# drop = FALSE keeps the strain name on the column even if genos has a single
# column.
snps <- cbind(snps[, 1:2], snps[, colnames(genos), drop = FALSE])

# Row ids are 'snp.<column 1>.<column 2>' (chromosome and position, judging by
# the ids used later in this script). They are built with vectorised paste()
# rather than apply(): apply() converts the data frame with as.matrix(), which
# runs format() over non-character columns and pads them to a common width, so
# ids could end up containing spaces.
rownames(snps) <- paste('snp', snps[[1]], snps[[2]], sep = '.')

# STR row ids get a 'str.' prefix.
rownames(genos) <- paste('str', rownames(genos), sep = '.')

# added 12/8/16 to control gwas
# Hand-maintained list of the STR loci used as GWAS controls.
# NOTE(review): this list contains 'str.86626' but not 'str.83458', whereas
# control_gwas_snps and control_phenos contain 'str.83458' and no 'str.86626'
# -- confirm which set is intended.
control_str_ids <- c(
  'str.1175', 'str.1665', 'str.2479',
  'str.3692', 'str.3950', 'str.4806',
  'str.37359', 'str.41068', 'str.43058',
  'str.43970', 'str.65213', 'str.65291',
  'str.83235', 'str.83239', 'str.84683',
  'str.86275', 'str.86626', 'str.99290'
)
# Genotype rows for those loci, in the order listed.
control_gwas_str <- genos[control_str_ids, ]
# Control SNP id for each control STR (name = STR id, value = SNP id).
# Entries tagged 'exists?' were never confirmed to be present in the SNP table.
control_gwas_snps <- c(
  'str.1175'  = 'snp.1.1921587',  # exists?
  'str.1665'  = 'snp.1.2778963',
  'str.2479'  = 'snp.1.4143163',
  'str.3692'  = 'snp.1.6369772',
  'str.3950'  = 'snp.1.6371576',
  'str.4806'  = 'snp.1.7785207',
  # 'str.6864'  = 'snp.1.10458712',
  # 'str.37195' = 'snp.2.9598635',  # gone now
  'str.37359' = 'snp.2.9602595',
  'str.41068' = 'snp.2.13886899', # exists?
  'str.43058' = 'snp.2.16917776',
  'str.43970' = 'snp.2.18439471',
  'str.65213' = 'snp.4.429928',
  'str.65291' = 'snp.4.581120',
  'str.83235' = 'snp.5.2265385',
  'str.83239' = 'snp.5.2339456',
  'str.83458' = 'snp.5.2337756',
  'str.84683' = 'snp.5.4697537',  # exists?
  'str.86275' = 'snp.5.6685571',
  'str.99290' = 'snp.5.18608407'  # exists?
)


# Phenotype associated with each control STR (name = STR id, value =
# phenotype column name). The entries are in the same order as
# control_gwas_snps, which the positional cbind() further down relies on.
#
# Every key carries the 'str.' prefix. The 4806 entry used to be keyed as
# plain '4806', which made it the one name that did not match its
# control_gwas_snps counterpart; these names also appear to become the row
# labels of the table written out from control_phenos.
control_phenos <- c(
  'str.1175'  = 'X279_DSDS50',
  'str.1665'  = 'X47_0W.GH.FT',
  'str.2479'  = 'X32_avrPphB',
  'str.3692'  = 'X6_FT16',
  'str.3950'  = 'X47_0W.GH.FT',
  'str.4806'  = 'X277_Secondary.Dormancy',
  # 'str.6864'  = 'X45_8W.GH.FT',
  # 'str.37195' = 'X2_LDV',  # gone now
  'str.37359' = 'X47_0W.GH.FT',
  'str.41068' = 'X7_FT22',
  'str.43058' = 'X47_0W.GH.FT',
  'str.43970' = 'X7_FT22',
  'str.65213' = 'X277_Secondary.Dormancy',
  'str.65291' = 'X6_FT16',
  'str.83235' = 'X7_FT22',
  'str.83239' = 'X17_Mg25',
  'str.83458' = 'X7_FT22',
  'str.84683' = 'X281_Storage.7.days',
  'str.86275' = 'X7_FT22',
  'str.99290' = 'X61_LC.Duration.GH'
)

# Control SNP ids that are not row names of snps are not an error for the
# lookup below: they simply come back as all-NA genotype rows. Several ids are
# marked 'exists?' where they are defined, so report them instead of writing
# the NA rows out silently.
missing_control_snps <- setdiff(control_gwas_snps, rownames(snps))
if (length(missing_control_snps) > 0) {
  warning(
    'control SNPs not found in snps (their genotype rows will be NA): ',
    paste(missing_control_snps, collapse = ', '),
    call. = FALSE
  )
}

# One row per control locus: phenotype name, control SNP id, then that SNP's
# genotype in every strain of genos. The two lookup vectors are combined
# positionally, so they must list the loci in the same order.
control_snp_phenos = cbind(control_phenos,control_gwas_snps,snps[control_gwas_snps,colnames(genos)])

write.table(control_snp_phenos,'control_snp_phenos_strwa_control_032317.txt',quote=FALSE)

# SNPs written out for effect-size estimation.
# all with p < 10^(-6) for pheno X1_LD
ld_snps <- c(
  'snp.1.3895353',
  'snp.2.9581605',
  'snp.5.19508285',
  'snp.5.25386559'
)

# For those SNPs: the two leading columns of snps, followed by the genotype
# columns named after the columns of genos.
est_effect_snps <- cbind(
  snps[ld_snps, 1:2],
  snps[ld_snps, colnames(genos)]
)

write.table(est_effect_snps, 'est_effect_snps_031617.txt', quote = FALSE)
