#!/usr/bin/Rscript

# updated 12/6/16 to use the log transformations recommended by Atwell et al. 
# updated 10/31/16 to use something other than crappy STRUCTURE clusters to correct for pop structure
# updated 8/16/16 to work with new, much bigger dataset

# use mixed model to do this
#library(lme4)
library(coxme)
library(stringr)
library(MASS)

set.seed(12345)

# genomic inflation factor, estimated post facto with GenABEL
# not currently using, not obvious what to do with it.
#inflation = 1.79

cat('reading data\n')
# make sure first that data has been EDITED for stupid spelling and parsing errors
strs = read.table('data/mip_geno_filtered_table.txt',header=T)

# now mapping among 3 files, simpler to change strs in this case
colnames(strs) = str_to_upper(colnames(strs))
colnames(strs)[colnames(strs)=='KNOX.18'] = 'KNO.18'
colnames(strs)[colnames(strs)=='KNOX.10'] = 'KNO.10'
colnames(strs)[colnames(strs)=='AN.0'] = 'AN.1'
colnames(strs)[colnames(strs)=='COL'] = 'COL.0'


pheno = read.table('data/phenotype_published_raw_rename.txt',header=T,sep='\t')

# read in snp data (if not already computed)
if (file.exists('data/Kinmat.Rdat')) { load('data/Kinmat.Rdat') } else {
# have to thin this to just the strains i have str data for...
snps = read.csv('data/call_method_75/call_method_75_TAIR9.csv',header=T,stringsAsFactors = FALSE)
snp_meta = read.csv('data/call_method_75/call_method_75_info_ascii.csv',header=T,stringsAsFactors = FALSE)
colnames(snps)[3:ncol(snps)] = str_to_upper(gsub(' ','',snp_meta[unlist(snps[1,3:ncol(snps)]),'nativename']))
colnames(snps)[3:ncol(snps)] = gsub('-','.',colnames(snps)[3:ncol(snps)])
print('snped')

# fix mapping errors case-by-case
colnames(strs)[colnames(strs)=='AN-0'] = 'AN-1'

snps =  snps[2:nrow(snps),str_to_upper(colnames(strs))]

# estimating kinship as done in emma is very straightforward- just binary distances. because the emma kinship function
# chokes on the character data, i am just computing this myself (I tested and it behaves the same)
K = matrix( rep(NA, ncol(snps)^2), ncol(snps) )	# square matrix to hold pairwise kinships
# stupid loop, but works? probably slow
for (i in 1:ncol(snps) ) {
	cat('computing kinship',i,'\n')
	for (j in 1:ncol(snps)) {
		K[i,j] = length(which(snps[,i] == snps[,j])) / nrow(snps)
		}
	}

rownames(K) = colnames(strs)
colnames(K) = colnames(strs)

save(K,file='data/Kinmat.Rdat')
remove(snps,snp_meta)
}

if (length(which(is.na(K))) > 0) {sys.exit('K matrix contains NAs')}

cat('doing PCA')
# PCA on STRs as one method of pop correction
var_genos = t(na.omit(strs))
vars = apply(var_genos,2,var)
var_genos = var_genos[,vars>0]
scale_pca = prcomp(var_genos,scale=T)
cat('first PCA eigs:',scale_pca$sdev[1:10]/sum(scale_pca$sdev),'\n')
pop_corr = scale_pca$x[,1:5]
rownames(pop_corr) = rownames(var_genos)

# 8/31/16 added tryCatch so that failed model fit doesn't kill script
# need to keep an eye on this because fatal errors won't kill, but instead return NA.
# 10/31/16 NEED TO FIX this to do a sensible correction using coxme function lmekin
new_mma = function(pops,genos,phenos,K) {
	# try to make binom models work
	#tryCatch( {
	# work with presence/absence vars
	#if (length(unique(phenos))==2) {
	#return(NA)}
	data = data.frame(id=pops,genos=genos,phenos=phenos)
	genomodel = lmekin(as.numeric(phenos) ~ genos + (1|id), varlist=K,data=data,method='ML')
	nullmodel = lmekin(as.numeric(phenos) ~ (1|id), varlist = K, data=data, method='ML')	# can't compare models fit with REML with diff fixed eff
	return(list(geno=genomodel,null=nullmodel))
	#}, error=function(e) {
	#	message(e)
	#	return(NA)
	#	})
	}

# try fitting PCA from STRs as fixed effect, as Oerjan suggested.
pca_mod = function(pops,genos,phenos) {
        tryCatch( {
	pop_corr = pops
        # work with presence/absence vars
        if (identical(sort(unique(phenos)), c(0,1))) {
        genomodel = glm(as.numeric(phenos) ~ genos + as.numeric(pop_corr[,1]) + as.numeric(pop_corr[,2])+ as.numeric(pop_corr[,3])+ as.numeric(pop_corr[,4])+ as.numeric(pop_corr[,5]),family='binomial')
        nullmodel = glm(as.numeric(phenos) ~ as.numeric(pop_corr[,1]) + as.numeric(pop_corr[,2])+ as.numeric(pop_corr[,3])+ as.numeric(pop_corr[,4])+ as.numeric(pop_corr[,5]),family='binomial')  # REML apparently not used with GLMMs
        } else {
        genomodel = lm(as.numeric(phenos) ~ genos + as.numeric(pop_corr[,1]) + as.numeric(pop_corr[,2])+ as.numeric(pop_corr[,3])+ as.numeric(pop_corr[,4]) + as.numeric(pop_corr[,5]))  # would be done in anova step anyways, just saves time
        nullmodel = lm(as.numeric(phenos) ~ as.numeric(pop_corr[,1]) + as.numeric(pop_corr[,2]) + as.numeric(pop_corr[,3]) + as.numeric(pop_corr[,4])+ as.numeric(pop_corr[,5])) # can't compare models fit with REML with diff fixed eff
                }
        return(list(null=nullmodel,geno=genomodel))
	}, error=function(e) {
                message(e)
                return(NA)
                })
        }

print('starting to rename everything')

colnames(strs) = toupper(gsub('_[0-9a-zA-Z]+','',colnames(strs)))
# fix a few mapping errors between geno/pheno datasets
colnames(strs)[colnames(strs)=='COL'] = 'COL.0'
colnames(strs)[colnames(strs)=='OMO2.1'] = 'OMO.2.1'
colnames(strs)[colnames(strs)=='OMO2.3'] = 'OMO.2.3'
colnames(strs)[colnames(strs)=='SHA'] = 'SHAHDARA'
colnames(strs)[colnames(strs)=='AN.0'] = 'AN.1'
colnames(strs)[colnames(strs)=='KNOX.18'] = 'KNO.18'
colnames(strs)[colnames(strs)=='KNOX.10'] = 'KNO.10'
colnames(strs)[colnames(strs)=='VAR2.6'] = 'VAR.2.6'
colnames(strs)[colnames(strs)=='VAR2.1'] = 'VAR.2.1'

rownames(pop_corr) = colnames(strs)
rownames(K) = colnames(strs)
colnames(K) = colnames(strs)

#print(colnames(strs))

phenoaccs = toupper(gsub('-','\\.',as.vector(pheno[,2])))
accs = phenoaccs[phenoaccs %in% colnames(strs)]
phenoin = as.matrix(pheno[phenoaccs %in% accs,])
rownames(phenoin) = as.vector(accs)

repin = as.matrix(strs[,colnames(strs) %in% accs])	

# rownames already set as mips
phenoin = phenoin[colnames(repin),]
print(dim(phenoin))

# some phenos need to be log-transformed
# these ones were transformed by atwell et al.
atwell_log = c('X1_LD','X2_LDV','X3_SD','X4_SDV','X5_FT10','X6_FT16','X7_FT22','X8_Seed.Dormancy','X16_Na23','X22_Mn55','X24_Co59','X30_Mo98',"X28_As75","X31_Cd114",'X39_0W',
	'X40_2W','X41_4W','X42_8W','X43_FLC','X44_FRI','X45_8W.GH.FT','X46_8W.GH.LN','X47_0W.GH.FT','X48_0W.GH.LN','X59_FT.GH',
	"X60_FT.Duration.GH",'X61_LC.Duration.GH','X62_LFS.GH','X65_At1','X67_As','X71_At2','X73_As2',"X75_FW","X76_DW",'X80_LN10','X81_LN16','X82_LN22',
	'X163_Germ.22','X164_Width.10','X183_Trichome.avg.C','X184_Trichome.avg.JA',"X182_Hypocotyl.length",
	'X277_Secondary.Dormancy','X279_DSDS50','X281_Storage.7.days','X282_Storage.28.days')

# OK: something weird happened here, but i think these should now be logged
#  due to the unexpected data transformation (???) observed 3/2017.
# so no longer NOT logging these:
# these ones give really bad/anticonservative results when logged
#dont_log = c('X4_SDV', 'X3_SD', 'X5_FT10', 'X6_FT16', 'X7_FT22', 'X41_4W', 'X47_0W.GH.FT', 'X61_LC.Duration.GH', 'X62_LFS.GH', 'X81_LN16', 'X82_LN22', 'X279_DSDS50', 'X281_Storage.7.days', 'X282_Storage.28.days')

#stopifnot(all(dont_log %in% colnames(phenoin)))

# combo of 2
#to_log = atwell_log[!(atwell_log %in% dont_log)]

#print(to_log)

print('log transforming')
#for (i in to_log) {
for (i in atwell_log) {
	print(i)
	phenold = phenoin[,i]
	phenoin[,i] = log(as.numeric( phenoin[,i] ) + .5 )
	#print(summary(as.numeric(phenold)))
	#print(summary(as.numeric(phenoin[,i])))
	}

# make sure that formats work
if (ncol(phenoin) <4) {
	print('no cols in pheno!!')
	q()
	} else if (nrow(phenoin) <1) {
	print('no rows in pheno table!!')
	q()
	} else if (nrow(repin) < 1){
	print('no repeat data!!')
	q() } else if (ncol(repin) <2) {
	print('repeat data not in right format!! (2 cols)')
	q()
	}

anova_ps = c()

num_strs = nrow(repin)
num_phenos = ncol(phenoin)-2

# matrix to hold vanilla anova pvals
anovas = matrix(rep(NA,num_strs*num_phenos),num_strs)

rownames(anovas) = rownames(repin)
colnames(anovas) = colnames(phenoin)[3:ncol(phenoin)]

# another matrix to hold the mixed-model pvals
mmas = anovas
pca_lm = anovas

rm(anovas)

for (i in rownames(repin)) {
	print(i)
	rep = repin[i,]
	if (length(unique(na.omit(rep)))<2) {
		print('no str variation')
		next
		}
		
#	for (pheno in c('X1_LD','X2_LDV')) {
	for (pheno in colnames(phenoin)[3:ncol(phenoin)]) {
		phenotype = phenoin[,pheno]
		#print(pheno)		
		# matrix is necessary for pw because of inevitable NAs
		# make an object that has pcs and ids (to be passed to lm or lmekin respectively)
		for_mi = as.matrix(na.omit(cbind(as.numeric(repin[i,]),as.numeric(phenotype),colnames(repin),pop_corr[colnames(repin),])))
		colnames(for_mi) = c('geno','pheno','id','PC1','PC2','PC3','PC4','PC5')

		if (length(unique(na.omit(for_mi[,2])))<2 || length(unique(na.omit(for_mi[,1])))<2 ) {
			print(pheno)
			#print(phenotype)
			print('no str or phenotype variation after filter')
			next
			}
	
		# filter for alleles with < 3 reps in ANOVA, had to change to account for fractionals
		# ok, changing this around because it is not clear it is helping...
		#if (nrow(na.omit(for_mi)) >= 10) {	
		#anovas[i,pheno] = cor.test(for_mi[,1],for_mi[,2],na.rm=TRUE)$p.value	
		#} 	
		counts = table(for_mi[,1])
		passed = names(counts[counts>2])
		#passing = which(for_mi[,1] %in% passed)
		
		# ran into troubles with lmer crapping out on weird data...
		# hopefully this screens out
		if (nrow(na.omit(for_mi))>=25) {
			if (length(passed) >= 2) {
				#phenos = for_mi[,2]
				phenos = as.numeric(for_mi[,2])
				genos = as.factor(for_mi[,1])
				id = as.factor(for_mi[,3])
				pops = for_mi[,4:ncol(for_mi)]
				
				# first, standard emma-like kinship correction
				mods =
				new_mma(
				phenos = phenos, 
				genos = genos, 
				pops = id,
				K = K
				)
				
				#print(mods)
				#if (length(mods) == 1) {
				#cat('failed model (binomial response) fit for',i,pheno,'\n')
				#next} # removes binomial outcomes (see fn above)
				# LRT
				LRT2 = 2 * (mods$geno$loglik - mods$null$loglik) # / inflation
				# DF = num parameters estimated for genos - 1
				df = length(mods[[1]]$coefficients$fixed) - 1 
				p = pchisq(LRT2, df=df, lower=FALSE)	# when not FALSE, it is very confusing!!
				mmas[i,pheno] = p
				
				# now PCA correction
				mods = 
				pca_mod(
				phenos = as.numeric(phenos),
				genos = genos,
				pops = pops
				)
				
				# LRT
				LRT2=2*(as.numeric(logLik(mods$geno))-as.numeric(logLik(mods$null))) # / inflation
				#LRT2 = 2 * (logLik(mods$geno) - logLik(mods$null))
				df = summary(mods$geno)$df[1] - summary(mods$null)$df[1]
				p = pchisq(LRT2, df=df, lower=FALSE) # when not false, very confusing!!
				
				pca_lm[i,pheno] = p
				}
			} else {print('not enough data')}
		}
	}

write.table(mmas,file='str_pheno_mmas_pvals_031717_final.txt',quote=FALSE,sep='\t')
#write.table(anovas,file='str_pheno_anovas_pvals_110216.txt',quote=FALSE,sep='\t')
write.table(pca_lm,file='str_pheno_pcalm_pvals_031717_final.txt',quote=FALSE,sep='\t')
