# Extract modal counts of STR copy number from Keisha's ".txt" files
# summarizing degenerate counts.
# Calling thresholds read by get_table_mode(). A mode is only called when
# at least this many table rows (capture events) carry the modal copy number...
capture_event_thresh = 2
# ...and the `count` column (reads) summed over those rows reaches this total.
read_thresh = 4

# Read one degenerate-tag count table and call its modal STR copy number.
#
# Args:
#   file: path to a table with a header row, readable by read.table(). The
#         columns `cp_number` and `count` are used.
#
# Returns:
#   The most frequent `cp_number` value as a numeric scalar, provided that at
#   least `capture_event_thresh` rows carry that value and their `count`
#   column sums to at least `read_thresh`. Otherwise NA, after printing a
#   short diagnostic. When several values are equally frequent, the one that
#   comes first in table() order wins.
#
# Both thresholds are looked up in the enclosing (script-level) environment.
get_table_mode = function(file) {
	mip = read.table(file, header=TRUE)
	# cp_number is KC's way of talking about STR unit copy number
	cp_counts = table(mip$cp_number)
	if (length(cp_counts) > 0) {
		# which.max() returns the first maximum, matching a stable sort on -table
		mode = as.numeric(names(cp_counts)[which.max(cp_counts)])
	} else {
		# no usable cp_number values at all (empty table or all NA)
		mode = NA_real_
	}

	# which() discards rows whose cp_number is NA. Indexing with the bare
	# logical vector would turn each of them into an all-NA row, inflating the
	# event count and making the read total NA.
	match_mode = mip[which(mip$cp_number == mode), , drop=FALSE]
	n_events = nrow(match_mode)
	n_reads = sum(match_mode$count)

	if (n_events >= capture_event_thresh && n_reads >= read_thresh) {
		return(mode)
	}

	cat('failed to call','\n')
	cat('events',nrow(mip),'\n')
	cat('events matching mode',n_events,'\n')
	cat('reads matching mode',n_reads,'\n')
	NA
}

# Call the modal STR copy number for every MIP count table in a directory.
#
# Args:
#   direc: directory holding the output of degenerate tag counting for one
#          strain. Every file whose name ends in ".txt" is taken to be one
#          MIP's count table, except files whose name contains the literal
#          text ".raw", which are skipped.
#
# Returns:
#   A numeric vector with one element per table, named by the file name with
#   its trailing ".txt" removed. Entries are NA where get_table_mode()
#   declined to make a call. A directory with no matching files gives an
#   empty named vector.
call_genos_dir = function(direc) {
	# The pattern is a regular expression: escape the dot and anchor at the
	# end so only a real ".txt" extension matches.
	files = dir(direc, pattern='\\.txt$')
	# fixed=TRUE so '.raw' is matched literally rather than as "any char + raw"
	files = files[!grepl('.raw', files, fixed=TRUE)]

	genotypes = vapply(file.path(direc, files), get_table_mode, numeric(1),
		USE.NAMES=FALSE)
	# strip only the extension, never a "?txt" run in the middle of the name
	names(genotypes) = sub('\\.txt$', '', files)
	genotypes
}

# actually go through each strain now: each call yields a vector of modal
# copy numbers named by MIP
print('calling Col')
direc = '/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/R1R2_Directories/Unzipped_fastqs_20160224/Col0_NewMIP_20160224/degen_counts'
col = call_genos_dir(direc)

print('calling Bay')
direc = '/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/R1R2_Directories/Unzipped_fastqs_20160224/Bay0_20160224/degen_counts'
bay = call_genos_dir(direc)

print('calling Cvi')
direc = '/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/R1R2_Directories/Unzipped_fastqs_20160224/Cvi_20160224/degen_counts'
cvi = call_genos_dir(direc)

print('calling Tsu')
direc = '/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/R1R2_Directories/Unzipped_fastqs_20160224/Tsu1_20160224/degen_counts'
tsu = call_genos_dir(direc)

print('calling Vod')
direc = '/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/R1R2_Directories/Unzipped_fastqs_20160224/Vod7_20160224/degen_counts'
vod = call_genos_dir(direc)

# The strains do not all come back with the same set of MIPs (in this run Cvi
# had two fewer STRs than the others and Col one fewer; the cause is still
# unknown). Rather than patching individual MIP ids into individual strains,
# take the union of MIP names so no row is dropped whichever strain lacks it.
mips = unique(c(names(col), names(bay), names(tsu), names(vod), names(cvi)))
if (length(mips) == 0) {
	stop('no genotype calls found for any strain', call.=FALSE)
}
mips = sort(mips)

# Indexing a named vector by a name it lacks gives NA, which is exactly the
# "no call" value wanted for a strain that has no table for that MIP.
# Columns are, in order: Col, Bay, Tsu, Vod, Cvi (left unnamed, as before).
str_table = cbind(col[mips],bay[mips],tsu[mips],vod[mips],cvi[mips])
# set row names explicitly: names taken from col[mips] would be NA for any
# MIP that Col lacks
rownames(str_table) = mips

write.table(str_table,'/net/queitsch/vol1/Users/MIPSTR_analysis2016/0217_2016_5strains/5strain_str_genotypes.txt',sep='\t',quote=FALSE)
