#!/usr/bin/Rscript
# control for effects of known snps affecting FT

# use mixed model to do this
#library(lme4)
library(coxme)
library(stringr)
library(MASS)

set.seed(12345)
cat('reading data\n')
# STR genotype table. The input file must already have been cleaned of
# spelling and parsing errors by hand.
strs <- read.table('data/mip_geno_filtered_table.txt', header = TRUE)

# Three files have to agree on accession names; it is simplest to rename the
# columns of strs. Names are upper-cased first, then each old -> new rename is
# applied in the order listed.
colnames(strs) <- str_to_upper(colnames(strs))
early_renames <- c(
	'KNOX.18' = 'KNO.18',
	'KNOX.10' = 'KNO.10',
	'AN.0' = 'AN.1',
	'COL' = 'COL.0',
	'AN-0' = 'AN-1'
)
for (old_name in names(early_renames)) {
	colnames(strs)[colnames(strs) == old_name] <- early_renames[[old_name]]
}

# decided against transformation, doesn't seem to remove anticonservative behavior
pheno <- read.table('data/phenotype_published_raw_rename.txt', header = TRUE, sep = '\t')


# NOTE(review): K (the matrix later given to lmekin as varlist) is never
# assigned in this script, so it is presumably restored by this load() - confirm.
load('data/Kinmat.Rdat')

# fix mapping errors case-by-case
#snps =  snps[2:nrow(snps),str_to_upper(colnames(strs))]

# Fit, for every control covariate, a pair of lmekin mixed models (with and
# without the genotype term) and collect their log-likelihoods.
#
# pops      - accession ids; grouping factor of the random intercept (1|id)
# genos     - genotype values, one per accession
# phenos    - phenotype values (coerced with as.numeric inside the formula)
# con       - table of control covariates; every named column is used in turn
# K         - matrix handed to lmekin as varlist
# pheno_str - label copied into every output row
#
# Returns a matrix with one row per column of con: the column name, pheno_str,
# the log-likelihood of the model including genos and the log-likelihood of the
# model without it. Because the numbers are combined with the names through
# c(), all cells are character strings. NULL is returned when con has no
# column names.
mma_wcon <- function(pops, genos, phenos, con, K, pheno_str) {
	fit_rows <- list()
	for (covariate in colnames(con)) {
		data <- data.frame(id = pops, genos = genos, phenos = phenos, con = con[, covariate])
		# ML, not REML: can't compare models fit with REML with diff fixed eff
		with_geno <- lmekin(as.numeric(phenos) ~ genos + con + (1|id), varlist = K, data = data, method = 'ML')
		without_geno <- lmekin(as.numeric(phenos) ~ con + (1|id), varlist = K, data = data, method = 'ML')
		fit_rows[[length(fit_rows) + 1]] <- c(covariate, pheno_str, with_geno$loglik, without_geno$loglik)
	}
	do.call(rbind, fit_rows)
}


# Remove every '_<letters/digits>' run from the genotype column names and
# upper-case what is left.
colnames(strs) <- toupper(gsub('_[0-9a-zA-Z]+', '', colnames(strs)))

# fix a few mapping errors between geno/pheno datasets (old name -> new name,
# applied in the order listed)
accession_fixes <- c(
	'COL' = 'COL.0',
	'OMO2.1' = 'OMO.2.1',
	'OMO2.3' = 'OMO.2.3',
	'SHA' = 'SHAHDARA',
	'AN.0' = 'AN.1',
	'KNOX.18' = 'KNO.18',
	'KNOX.10' = 'KNO.10',
	'VAR2.6' = 'VAR.2.6',
	'VAR2.1' = 'VAR.2.1'
)
for (wrong_name in names(accession_fixes)) {
	colnames(strs)[colnames(strs) == wrong_name] <- accession_fixes[[wrong_name]]
}

#rownames(pop_corr) = colnames(strs)
# Label both dimensions of K with the genotype column names. This is a purely
# positional relabelling, so it assumes K is ordered like the columns of strs.
rownames(K) <- colnames(strs)
colnames(K) <- colnames(strs)

#print(colnames(strs))

# Accession labels come from column 2 of the phenotype table; '-' is turned
# into '.' and the result upper-cased so that it is comparable with the column
# names of strs.
phenoaccs <- toupper(gsub('-', '\\.', as.vector(pheno[, 2])))

# Keep only the phenotype rows whose accession also has a genotype column.
has_genotype <- phenoaccs %in% colnames(strs)
accs <- phenoaccs[has_genotype]
phenoin <- as.matrix(pheno[has_genotype, ])
rownames(phenoin) <- as.vector(accs)

# Genotype columns restricted to the accessions that have phenotypes.
repin <- as.matrix(strs[, colnames(strs) %in% accs])

# rownames already set as mips
# Put the phenotype rows in the same accession order as the genotype columns.
phenoin <- phenoin[colnames(repin), ]
#print(dim(phenoin))

# some phenos need to be log-transformed
# these ones were transformed by atwell et al.
# Phenotype columns of phenoin that are replaced by log(value + 0.5).
atwell_log <- c('X1_LD','X2_LDV','X3_SD','X4_SDV','X5_FT10','X6_FT16','X7_FT22','X8_Seed.Dormancy','X16_Na23','X22_Mn55','X24_Co59','X30_Mo98',"X28_As75","X31_Cd114",'X39_0W',
	'X40_2W','X41_4W','X42_8W','X43_FLC','X44_FRI','X45_8W.GH.FT','X46_8W.GH.LN','X47_0W.GH.FT','X48_0W.GH.LN','X59_FT.GH',
	"X60_FT.Duration.GH",'X61_LC.Duration.GH','X62_LFS.GH','X65_At1','X67_As','X71_At2','X73_As2',"X75_FW","X76_DW",'X80_LN10','X81_LN16','X82_LN22',
	'X163_Germ.22','X164_Width.10','X183_Trichome.avg.C','X184_Trichome.avg.JA',"X182_Hypocotyl.length",
	'X277_Secondary.Dormancy','X279_DSDS50','X281_Storage.7.days','X282_Storage.28.days')

# these ones give really bad/anticonservative results when logged
#dont_log = c('X4_SDV', 'X3_SD', 'X5_FT10', 'X6_FT16', 'X7_FT22', 'X41_4W', 'X47_0W.GH.FT', 'X61_LC.Duration.GH', 'X62_LFS.GH', 'X81_LN16', 'X82_LN22', 'X279_DSDS50', 'X281_Storage.7.days', 'X282_Storage.28.days')

#stopifnot(all(dont_log %in% colnames(phenoin)))

# combo of 2
#to_log = atwell_log[!(atwell_log %in% dont_log)]
#print(to_log)

# Fail before touching any column, and name the offenders, rather than
# stopping half-way through the loop with "subscript out of bounds".
missing_log_cols <- setdiff(atwell_log, colnames(phenoin))
if (length(missing_log_cols) > 0) {
	stop('phenotype columns to log-transform not found: ',
		paste(missing_log_cols, collapse = ', '), call. = FALSE)
}

#print('log transforming')
# An offset of 0.5 is added before taking logs (presumably so that zero values
# stay finite). The values are parsed with as.numeric() first and written back
# into phenoin in place.
for (trait in atwell_log) {
	phenoin[, trait] <- log(as.numeric(phenoin[, trait]) + .5)
}

# make sure that formats work
# make sure that formats work
# Each failed check aborts through stop(): a non-interactive run then ends with
# a non-zero exit status (a bare q() reports success to the caller), and an
# interactive session is not closed.
if (ncol(phenoin) < 4) {
	stop('no cols in pheno!!', call. = FALSE)
} else if (nrow(phenoin) < 1) {
	stop('no rows in pheno table!!', call. = FALSE)
} else if (nrow(repin) < 1) {
	stop('no repeat data!!', call. = FALSE)
} else if (ncol(repin) < 2) {
	stop('repeat data not in right format!! (2 cols)', call. = FALSE)
}

anova_ps <- c()

num_strs <- nrow(repin)
# the first two columns of phenoin are not counted as phenotypes
num_phenos <- ncol(phenoin) - 2

# All-NA matrix for the mixed-model pvals: one row per row of repin, one
# column per column of phenoin from the third onwards.
mmas <- matrix(
	NA,
	nrow = num_strs,
	ncol = num_phenos,
	dimnames = list(rownames(repin), colnames(phenoin)[3:ncol(phenoin)])
)

# Control table. The code further down reads its 'control_phenos' and
# 'control_gwas_snps' columns and its per-accession columns, and uses its row
# names to look up rows of repin.
control_snps_etc <- read.table('data/control_snp_phenos_strwa_control_032317.txt', header = TRUE)

# simplest way to make the mapping work- fortunately, in the same order.
# Columns from the third onwards are relabelled with the column names of strs.
# The span is derived from ncol(strs) (3:98 for 96 accessions) so that a
# different accession count cannot be silently recycled or truncated.
stopifnot(2 + ncol(strs) <= ncol(control_snps_etc))
accession_cols <- 2 + seq_len(ncol(strs))
colnames(control_snps_etc)[accession_cols] <- colnames(strs)

out_controlling <- c()

#for (i in rownames(repin)) {
# For every row of the control table: take the matching STR genotype and the
# phenotype named in 'control_phenos', and print four ML-fitted lmekin models
# (intercept only, control SNP only, STR only, STR + control SNP) together with
# the log-likelihoods of the control-only and the full model.
#
# An STR is skipped (nothing is fitted) when it has no genotype variation,
# fewer than 25 accessions with both genotype and phenotype, or fewer than two
# alleles seen more than twice. Previously the fitting ran even when the last
# two filters failed, re-using the control SNP and phenotype of the previous
# STR.
for (i in rownames(control_snps_etc)) {
	# strip the 'str.' prefix from the control-table row name to get the repin row name
	rep_i <- gsub('str\\.([0-9]+)', '\\1', i)
	print(i)
	#print(rep_i)
	str_geno <- repin[rep_i, ]
	if (length(unique(na.omit(str_geno))) < 2) {
		print('no str variation')
		next
	}

	# kept in its own variable so the phenotype table 'pheno' is not overwritten
	pheno_name <- as.character(control_snps_etc[i, 'control_phenos'])
	phenotype <- phenoin[, pheno_name]
	print(pheno_name)

	# matrix is necessary for pw because of inevitable NAs
	# cbind() with the accession names gives a character matrix; na.omit() drops
	# accessions that lack either the genotype or the phenotype
	for_mi <- as.matrix(na.omit(cbind(as.numeric(str_geno), as.numeric(phenotype), colnames(repin))))
	colnames(for_mi) <- c('geno', 'pheno', 'id')
	if (length(unique(for_mi[, 'pheno'])) < 2 || length(unique(for_mi[, 'geno'])) < 2) {
		# dump the inputs before aborting so the offending STR can be inspected
		print(phenotype)
		print(str_geno)
		print(colnames(repin))
		print(for_mi)
		stop('no str or phenotype variation after filter')
	}

	# alleles observed in more than two accessions
	counts <- table(for_mi[, 'geno'])
	passed <- names(counts[counts > 2])

	# hopefully this screens out
	if (nrow(for_mi) < 25) {
		print('not enough data')
		next
	}
	if (length(passed) < 2) {
		print('not enough str alleles observed more than twice')
		next
	}

	con <- control_snps_etc[i, ]
	snp <- control_snps_etc[i, 'control_gwas_snps']

	for (snv in snp) {
		print(snv)
		# same table as above plus the control-SNP value of each accession, read
		# from the per-accession columns of this control-table row
		for_mi <- as.matrix(na.omit(cbind(as.numeric(str_geno), as.numeric(phenotype), colnames(repin), t(con[colnames(repin)]))))
		colnames(for_mi) <- c('geno', 'pheno', 'id', 'control_snp')
		data <- data.frame(id = for_mi[, 3], genos = for_mi[, 1], phenos = as.numeric(for_mi[, 2]), control = for_mi[, 4])
		cat(i, pheno_name, snp, '\n')

		# ML fits throughout so that log-likelihoods of models with different
		# fixed effects can be compared
		print(lmekin(as.numeric(phenos) ~ (1|id), varlist = K, data = data, method = 'ML'))

		conmod <- lmekin(as.numeric(phenos) ~ control + (1|id), varlist = K, data = data, method = 'ML')
		print(conmod)
		print(conmod$loglik)

		strmod <- lmekin(as.numeric(phenos) ~ as.factor(genos) + (1|id), varlist = K, data = data, method = 'ML')
		print(strmod)
		print(fixed.effects(strmod))
		print(strmod$VarCorr)

		fullmod <- lmekin(as.numeric(phenos) ~ as.factor(genos) + control + (1|id), varlist = K, data = data, method = 'ML')
		print(fullmod)
		print(fullmod$loglik)
	}
}

#write.table(out_controlling,'controlled_LLs_all_012417.txt',quote=FALSE,col.names=FALSE,row.names=FALSE)
#source('code/str_pheno_swarmer.R')
#write.table(mmas,file='str_pheno_mmas_pvals_121212_control_65291_37915.txt',quote=FALSE,sep='\t')
