library(readxl)

# Run from the DESeq folder: the relative file names used for the outputs at
# the end of this script are resolved against this directory.
setwd("//data.wexac.weizmann.ac.il/barkailab/gatk/R_DESeq/")

# Root of the project share and the folder holding the read-count workbooks.
homeDir <- "//data.wexac.weizmann.ac.il/barkailab/gatk/"
count_dir <- paste0(homeDir, "read_counts/")

# Load the three count workbooks. NOTE(review): judging by the file names
# these hold reads assigned strictly to the cer genome, strictly to the par
# genome, and to the combined reference -- confirm with the producer of the
# count files.
counts_strict_cer <- read_excel(paste0(count_dir, "counts_strict_cer.xlsx"))
counts_strict_par <- read_excel(paste0(count_dir, "counts_strict_par.xlsx"))
counts_combined <- read_excel(paste0(count_dir, "counts_combined.xlsx"))

# The first column of each count table is the identifier column (kept here as
# ORF); every remaining column is a sample. sampleNames is taken from the raw
# workbook headers, before data.frame() below gets a chance to rewrite them.
ORF <- counts_strict_cer[, 1]
sampleNames <- colnames(counts_strict_cer)[-1]
Nsamples <- length(sampleNames)

# Convert to plain data frames so row names can be attached. Note that
# data.frame() makes column names syntactically valid (check.names = TRUE).
counts_strict_cer <- data.frame(counts_strict_cer)
counts_strict_par <- data.frame(counts_strict_par)
counts_combined <- data.frame(counts_combined)

# count_list[[1]] = strict cer, [[2]] = strict par, [[3]] = combined.
# In each table the identifier column becomes the row names and is then
# removed. It is dropped by position (not by the name `ORF`) so the step does
# not depend on the header spelling and cannot pick up the global `ORF`
# object through subset()'s non-standard evaluation. drop = FALSE keeps a
# data frame even when only one sample column remains.
count_list <- lapply(
  list(counts_strict_cer, counts_strict_par, counts_combined),
  function(tbl) {
    rownames(tbl) <- tbl[[1]]
    tbl[, -1, drop = FALSE]
  }
)

# Counts present in the combined table but in neither strict table
# (combined - (cer + par)); presumably reads that cannot be assigned to one
# genome -- verify against the counting pipeline.
indist <- count_list[[3]] - (count_list[[1]] + count_list[[2]])

# Column positions of each sample type. sampleNames excludes the identifier
# column, so these positions index the tables in count_list directly.
idx_cer <- grep("cer", sampleNames)
idx_par <- grep("par", sampleNames)
idx_hyb <- grep("hyb", sampleNames)

# drop = FALSE keeps every selection a data frame even when a group contains
# a single sample; without it the selection collapses to a vector and the
# colnames<- / data.frame() steps below break.
cer_counts <- count_list[[3]][, idx_cer, drop = FALSE]
par_counts <- count_list[[3]][, idx_par, drop = FALSE]

# Hybrid samples are split into two pseudo-samples: the strict counts of one
# genome plus half of the unassigned counts. The "hyb" tag in the column
# names is rewritten to "hyc" / "hyp" respectively.
hyc_counts <- count_list[[1]][, idx_hyb, drop = FALSE] +
  indist[, idx_hyb, drop = FALSE] / 2
colnames(hyc_counts) <- gsub("hyb", "hyc", colnames(hyc_counts))
hyp_counts <- count_list[[2]][, idx_hyb, drop = FALSE] +
  indist[, idx_hyb, drop = FALSE] / 2
colnames(hyp_counts) <- gsub("hyb", "hyp", colnames(hyp_counts))

# One table with all columns in the order cer, par, hyc, hyp.
count_per_gen <- data.frame(cer_counts, par_counts, hyc_counts, hyp_counts)
rownames(count_per_gen) <- rownames(count_list[[3]])

# From here on sampleNames / Nsamples describe the columns of count_per_gen.
sampleNames <- colnames(count_per_gen)
Nsamples <- length(sampleNames)

# Sample metadata, one row per sequenced sample.
sumExp <- read_excel(paste0(homeDir, "Screen/read_organize_data/sumExp.xlsx"))
sumExp <- data.frame(sumExp)

# sampleName2 is a copy of sample_name that will carry the hyc/hyp renaming;
# it is added as the first column so the original sample_name stays intact.
sampleName2 <- sumExp$sample_name
sumExp <- cbind(sampleName2, sumExp)

# Every hybrid sample appears twice in the count table (as hyc and as hyp),
# so its metadata row is duplicated: the original row is relabelled "hyc",
# the copy "hyp", and the copies are appended at the bottom.
idx_hyb <- grep("hyb", sumExp$sample_name)
sumExp_hyb <- sumExp[idx_hyb, ]
sumExp$sampleName2 <- gsub("hyb", "hyc", sumExp$sampleName2)
sumExp_hyb$sampleName2 <- gsub("hyb", "hyp", sumExp_hyb$sampleName2)
sumExp <- rbind(sumExp, sumExp_hyb)

# gb starts as a copy of species and is overwritten with "hyc" / "hyp" for
# the rows of the split hybrid samples.
sumExp$gb <- sumExp$species
sumExp$gb[grep("hyc", sumExp$sampleName2)] <- "hyc"
sumExp$gb[grep("hyp", sumExp$sampleName2)] <- "hyp"

# Sort the metadata rows into the column order of count_per_gen. Rows whose
# name is not found among the count columns get NA and are sorted to the end.
# That only yields an aligned table when names correspond one-to-one, so a
# mismatch is reported instead of passing silently.
col_pos <- match(sumExp$sampleName2, colnames(count_per_gen))
if (anyNA(col_pos) || anyDuplicated(col_pos) > 0 ||
    length(col_pos) != ncol(count_per_gen)) {
  warning(
    "sumExp rows and count_per_gen columns do not correspond one-to-one; ",
    "rows of sumExp will not line up with the columns of count_per_gen.",
    call. = FALSE
  )
}
idx <- order(col_pos)
sumExp <- sumExp[idx, ]
sumExp$species <- as.factor(sumExp$species)
sumExp$gb <- as.factor(sumExp$gb)

# Missing counts are replaced by zero.
count_per_gen[is.na(count_per_gen)] <- 0

# Store every column as integer. as.integer() truncates toward zero, so a
# fractional count (e.g. from halving an odd number of unassigned reads)
# loses its fractional part rather than being rounded. Converting all columns
# at once avoids the 1:Nsamples index loop, which misbehaves for an empty
# table and relies on Nsamples matching the current column count.
count_per_gen[] <- lapply(count_per_gen, as.integer)

# Batch label: experiment and library identifiers joined by a dot.
sumExp$batch <- as.factor(paste0(sumExp$experiment_ID, ".", sumExp$library_ID))

# sp: the gb label with the hybrid tags mapped onto their genome
# ("hyc" -> "cer", "hyp" -> "par").
sumExp$sp <- gsub("hyp", "par", gsub("hyc", "cer", sumExp$gb))

# F: species recoded as text, "0" for cer and par, "1" for hyb.
sumExp$F <- gsub(
  "hyb", "1",
  gsub("par", "0", gsub("cer", "0", sumExp$species))
)

# Turn every column (numeric ones included) into a factor. Rebuilding the
# table through data.frame() also resets the row names to 1..n.
sumExp <- data.frame(lapply(sumExp, as.factor))


# Write the results using relative file names, i.e. into the current working
# directory: the count table and the sample table as RDS, plus a CSV copy of
# the counts.
saveRDS(object = count_per_gen, file = "count_per_gen.rds")
saveRDS(object = sumExp, file = "sumExp.rds")
write.csv(x = count_per_gen, file = "count_per_gen.csv")