library("tidyverse")
library("monocle3")

# Load the serialized object whose colData holds the per-cell annotations
# (presumably a monocle3 cell_data_set -- confirm).
cds <- readRDS(
  file = "/net/waterston/vol6/files_for_lou/cds_fly_pca300_vers3.final.annotation.rds"
)

# One row per cell. Cells with a missing cell.type.3 label are relabelled
# "NotAssigned", and spaces in the labels are replaced by underscores (the
# labels are used further down to build file names).
allCells <- colData(cds) %>%
  as.data.frame() %>%
  mutate(
    cell.type.3 = if_else(is.na(cell.type.3), "NotAssigned", cell.type.3),
    cell.type.3 = str_replace_all(cell.type.3, " ", "_")
  )

# Unique cell-type labels, as a one-column data frame.
cellTypes <- allCells %>%
  dplyr::distinct(cell.type.3)

# Number of cells carrying each cell-type label.
g <- allCells %>%
  dplyr::group_by(cell.type.3) %>%
  dplyr::summarise(nCells = n())
# Build one wide table of bootstrap TPM values: one row per gene and one
# column per cell type. Each cell type has its own file, <cellType>.tsv,
# containing a `gene` column plus a single value column.
bootstrapDir <- "/net/waterston/vol9/ChipSeqPipeline/BootstrapTPM/"

if (nrow(cellTypes) == 0L) {
  stop("No cell types found; nothing to combine.", call. = FALSE)
}

l <- vector("list", nrow(cellTypes))
genes <- NULL
for (i in seq_len(nrow(cellTypes))) {
  cellType <- cellTypes$cell.type.3[i]
  fileName <- str_c(bootstrapDir, cellType, ".tsv")
  tsv <- read_tsv(fileName, col_names = TRUE)

  # The per-cell-type columns are pasted side by side (by position) below,
  # so every file must list exactly the same genes in exactly the same
  # order. Fail loudly instead of writing a silently misaligned table.
  if (is.null(genes)) {
    genes <- tsv$gene
  } else if (!identical(genes, tsv$gene)) {
    stop(
      "Gene column of ", fileName, " differs from the previously read ",
      "files; columns cannot be combined by position.",
      call. = FALSE
    )
  }

  tsv <- select(tsv, -gene)
  # Exactly one value column is expected; it is renamed to the cell type.
  if (ncol(tsv) != 1L) {
    stop(
      "Expected exactly one value column besides `gene` in ", fileName,
      " but found ", ncol(tsv), ".",
      call. = FALSE
    )
  }
  colnames(tsv) <- cellType
  l[[i]] <- tsv
}

# Gene identifiers go first, followed by one column per cell type.
tsv <- bind_cols(l) %>%
  mutate(Gene = genes, .before = 1)
write_tsv(
  tsv,
  file = str_c(
    bootstrapDir,
    "cds_fly_pca300_vers3.final.annotation.bootstrapTPM.tsv"
  )
)




# Reference expression table. Its `gene` column is renamed to `Gene` so it
# shares a key name with the bootstrap table; it is also joined on
# `cellType` and filtered on `TPM` below.
orig <- read_tsv(
  file = "/net/waterston/vol9/ChipSeqPipeline/fly_exp_matrix_renamed_and_clean_with_83_cell_types.tsv",
  col_names = TRUE
) %>%
  dplyr::rename(Gene = gene)

# Re-read the wide bootstrap table from disk and reshape it to long form:
# one row per (Gene, cellType) pair with the value in BootTPM.
tsv <- read_tsv(
  file = "/net/waterston/vol9/ChipSeqPipeline/BootstrapTPM/cds_fly_pca300_vers3.final.annotation.bootstrapTPM.tsv",
  col_names = TRUE
)
tsvL <- tsv %>%
  pivot_longer(
    cols = -Gene,
    names_to = "cellType",
    values_to = "BootTPM"
  )

# Number of exactly-zero entries in each table before joining.
orig %>% filter(TPM == 0.0) %>% nrow()
tsvL %>% filter(BootTPM == 0.0) %>% nrow()

# Full join keeps (Gene, cellType) pairs that appear in only one of the two
# tables; the missing side is NA for those rows.
j <- tsvL %>%
  full_join(orig, by = c("Gene", "cellType"))

# The same zero counts after joining.
j %>% filter(TPM == 0.0) %>% nrow()
j %>% filter(BootTPM == 0.0) %>% nrow()

write_tsv(
  j,
  file = "/net/waterston/vol6/files_for_lou/cds_fly_pca300_vers3.final.annotation.bootstrap.compare.tsv",
  col_names = TRUE
)


# Percent difference of the bootstrap value relative to the reference TPM,
# then attach the per-cell-type cell counts (nCells) from `g`.
# Note: where TPM is 0 the division yields Inf/-Inf (or NaN when BootTPM is
# also 0).
j <- j %>%
  mutate(pctDiff = 100.0 * (BootTPM - TPM) / TPM) %>%
  left_join(g, by = c("cellType" = "cell.type.3"))

# Inspect the rows where either measure exceeds 500.
View(filter(j, BootTPM > 500 | TPM > 500))

# Histograms (100 bins) of each measure, restricted to values below 500.
j %>%
  filter(TPM < 500) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = TPM), bins = 100)
j %>%
  filter(BootTPM < 500) %>%
  ggplot() +
  geom_histogram(mapping = aes(x = BootTPM), bins = 100)
