# Human_align.txt is from RH_pools_human_align.xlsx, which is a simplified version of RH_mapped_unmapped_info_hg38.xlsx from Arshad




# ------------------ sem, compare fxns -------------------------


# Standard error of the mean of x. NA values are dropped both from the
# variance and from the sample size.
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Print a two-sample t-test of a versus b (t.test defaults), followed by the
# exact P value and the mean, sem, sd and non-NA count of each sample.
# Relies on sem() defined above. Returns, invisibly, the last string printed.
compare <- function(a, b) {
  # Run the test once; the same result supplies the printed report and the
  # exact P value.
  ttest_result <- t.test(a, b)
  print(ttest_result)

  print(paste0("exact P value = ", ttest_result$p.value))

  mean_a <- mean(a, na.rm = TRUE)
  print(paste0("mean of a = ", mean_a))
  sem_a <- sem(a)
  print(paste0("sem of a = ", sem_a))
  sd_a <- sd(a, na.rm = TRUE)
  print(paste0("sd of a = ", sd_a))
  n_a <- sum(!is.na(a))
  print(paste0("number in a = ", n_a))

  mean_b <- mean(b, na.rm = TRUE)
  print(paste0("mean of b = ", mean_b))
  sem_b <- sem(b)
  print(paste0("sem of b = ", sem_b))
  sd_b <- sd(b, na.rm = TRUE)
  print(paste0("sd of b = ", sd_b))
  n_b <- sum(!is.na(b))
  print(paste0("number in b = ", n_b))
}




# ---------------- load data ---------------------------




# Per-sample human alignment counts, and the table that assigns a cell label
# to each pool / conc / week combination.
Human_align <- read.table(
  "Human_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

cell <- read.table(
  "cell_label_info.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# Derive pool, week and conc from RH_ID (e.g. "RH1_w1_d25"). A row whose RH_ID
# matches none of the listed values is left as NA in that column.
for (pool_id in c(1, 2, 3, 4, 5, 6)) {
  Human_align[grep(paste0("^RH", pool_id), Human_align$RH_ID), "pool"] <- pool_id
}

for (week_id in c(0, 1, 2, 3, 4, 6)) {
  Human_align[grep(paste0("_w", week_id, "_"), Human_align$RH_ID), "week"] <- week_id
}

for (conc_id in c(0, 8, 25, 75)) {
  Human_align[grep(paste0("_d", conc_id), Human_align$RH_ID), "conc"] <- conc_id
}

dim(Human_align)
# [1] 115   9

head(Human_align, n = 6L)
       # RH_ID Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned human_aligned_and_hamster_aligned
# 1  RH1_w0_d0    42000172        41601530        398642                              298433                            100209
# 2  RH1_w1_d0    47971567        47496634        474933                              372475                            102458
# 3  RH1_w1_d8    62213204        61653445        559759                              425169                            134590
# 4 RH1_w1_d25    55396431        54958564        437867                              321998                            115869
# 5 RH1_w1_d75    66004325        65612039        392286                              247321                            144965
# 6  RH1_w2_d0    38276132        37777814        498318                              310062                            188256
  # pool week conc
# 1    1    0    0
# 2    1    1    0
# 3    1    1    8
# 4    1    1   25
# 5    1    1   75
# 6    1    2    0

dim(cell)
# [1] 115   4

head(cell)
  # pool conc week cell
# 1    1    0    0    1
# 2    1    0    1    2
# 3    1    0    2    2
# 4    1    0    3    2
# 5    1    0    4    2
# 6    1    0    6    2

# Attach the cell label to each sample. The join keys are spelled out so that a
# column later added to both tables cannot silently become part of the key.
Human_align_1 <- merge(Human_align, cell, by = c("pool", "week", "conc"))

# Cheap guard: merge() drops samples with no match in `cell` and duplicates
# samples with several matches, in both cases without any warning.
stopifnot(nrow(Human_align_1) == nrow(Human_align))

dim(Human_align_1)
# [1] 115  10

# Put the identifying columns first, followed by the read counts.
Human_align_2 <- Human_align_1[, c(
  "RH_ID", "pool", "week", "conc", "cell",
  "Total_reads", "human_unaligned", "human_aligned",
  "human_aligned_and_hamster_unaligned", "human_aligned_and_hamster_aligned"
)]

# Sort by cell label.
Human_align_3 <- Human_align_2[order(Human_align_2$cell), ]



dim(Human_align_3)
# [1] 115  10


head(Human_align_3)
       # RH_ID pool week conc cell Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned
# 1  RH1_w0_d0    1    0    0    1    42000172        41601530        398642                              298433
# 2  RH1_w1_d0    1    1    0    2    47971567        47496634        474933                              372475
# 6  RH1_w2_d0    1    2    0    2    38276132        37777814        498318                              310062
# 9  RH1_w3_d0    1    3    0    2    34816772        34444880        371892                              288187
# 13 RH1_w4_d0    1    4    0    2    36328023        35818437        509586                              326399
# 17 RH1_w6_d0    1    6    0    2    50327669        49691822        635847                              527642
   # human_aligned_and_hamster_aligned
# 1                             100209
# 2                             102458
# 6                             188256
# 9                              83705
# 13                            183187
# 17                            108205

tail(Human_align_3)
         # RH_ID pool week conc cell Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned
# 103 RH6_w2_d25    6    2   25   29    33860116        33156618        703498                              539605
# 106 RH6_w3_d25    6    3   25   29    34412488        33503393        909095                              739908
# 110 RH6_w4_d25    6    4   25   29    41335980        40065276       1270704                             1068617
# 113 RH6_w6_d25    6    6   25   29    41324384        39887294       1437090                             1234824
# 107 RH6_w3_d75    6    3   75   30    38966343        37851842       1114501                              921369
# 114 RH6_w6_d75    6    6   75   30    38440264        37310117       1130147                              940009
    # human_aligned_and_hamster_aligned
# 103                            163893
# 106                            169187
# 110                            202087
# 113                            202266
# 107                            193132
# 114                            190138

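# NOTE: RH_pool_human_total_align.txt is read back in further down (section "Calculate ratio hamster aligned reads to human aligned reads"). Uncomment the next line to regenerate it whenever Human_align_3 changes.
# write.table(Human_align_3, "RH_pool_human_total_align.txt",quote=FALSE,sep="\t",row.names=FALSE)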



# ~~~~~~~~~~~~~ Prepare table for latex: ~~~~~~~~~~~~~~~~~~~~~~~


# ~~~~~~~ RH pools only ~~~~~~~~~~~~~~~~~~~~~

# Samples taken at week 0 with conc 0 (one row per pool in the output below).
rh_pools <- Human_align_3[Human_align_3$week == 0 & Human_align_3$conc == 0, ]

# Percentage of human-aligned reads that also aligned to hamster, unrounded.
pct_hamster_aligned <- 100 * (rh_pools$human_aligned_and_hamster_aligned / rh_pools$human_aligned)

# Table rows: the read counts plus the percentage formatted to one decimal.
# The percentage column is named explicitly; left unnamed, cbind() labels it
# with the deparsed expression.
Table_body <- cbind(
  rh_pools[, c("pool", "Total_reads", "human_aligned", "human_aligned_and_hamster_unaligned", "human_aligned_and_hamster_aligned")],
  pct_hamster_aligned = format(round(pct_hamster_aligned, 1), nsmall = 1)
)

# Summary row. The mean percentage is taken from the unrounded values rather
# than from the already-rounded strings in Table_body.
Table_end <- c(
  "Means",
  round(colMeans(Table_body[, 2:5])),
  format(round(mean(pct_hamster_aligned), 1), nsmall = 1)
)

Table_body
   # pool Total_reads human_aligned human_aligned_and_hamster_unaligned human_aligned_and_hamster_aligned
# 1     1    42000172        398642                              298433                            100209
# 21    2    35358581        868990                              785571                             83419
# 40    3    52166656       1173763                             1063450                            110313
# 60    4    32756205       1121927                             1048516                             73411
# 80    5    42876176        853173                              638622                            214551
# 98    6    41449894        842565                              632519                            210046
   # pct_hamster_aligned
# 1                 25.1
# 21                 9.6
# 40                 9.4
# 60                 6.5
# 80                25.1
# 98                24.9


Table_end
                                                            # Total_reads                       human_aligned 
                            # "Means"                          "41101281"                            "876510"  <<<<<<<<<<<<< use in paper
# human_aligned_and_hamster_unaligned   human_aligned_and_hamster_aligned                                     
                           # "744518"                            "131992"                              "16.8" 

# write.table(Table_body,"Table_body_hum_align.txt",quote=FALSE,sep="&\t",row.names=FALSE,col.names=FALSE,eol="\\\n")

# write.table(Table_end,"Table_end_hum_align.txt",quote=FALSE,sep="&\t",row.names=FALSE,col.names=FALSE,eol="\\\n")

# Week-0, conc-0 samples, selected here so that this section runs on its own.
rh_pools <- Human_align_3[Human_align_3$week == 0 & Human_align_3$conc == 0, ]
rh_count_cols <- c("Total_reads", "human_aligned", "human_aligned_and_hamster_unaligned", "human_aligned_and_hamster_aligned")
rh_pct_hamster_aligned <- 100 * (rh_pools$human_aligned_and_hamster_aligned / rh_pools$human_aligned)

# The SEMs below use sem(), which takes the sample size from the data instead
# of a hard-coded sqrt(6).

# sem of error rate as percent:
sem(rh_pct_hamster_aligned)
# [1] 3.728779 <<<<<<< use in paper

# sem of other cols:
vapply(rh_pools[, rh_count_cols], sem, numeric(1))
                        # Total_reads                       human_aligned human_aligned_and_hamster_unaligned 
                         # 2763700.47                           112353.62                           118119.62  <<<<<<<<<<<<< use in paper
  # human_aligned_and_hamster_aligned 
                           # 25935.58 



# sem of all cols:
c(
  vapply(rh_pools[, rh_count_cols], sem, numeric(1)),
  pct_hamster_aligned = sem(rh_pct_hamster_aligned)
)
                        # Total_reads                       human_aligned human_aligned_and_hamster_unaligned 
                       # 2.763700e+06                        1.123536e+05                        1.181196e+05 
  # human_aligned_and_hamster_aligned                 pct_hamster_aligned 
                       # 2.593558e+04                        3.728779e+00 



# ~~~~~~~~~~~~~~~ All samples ~~~~~~~~~~~~~~~~~~~~~~~~~~~


# Read-count columns of Human_align_3, selected by name rather than by
# position so that reordering or adding columns cannot change what is
# summarised.
all_count_cols <- c(
  "Total_reads", "human_unaligned", "human_aligned",
  "human_aligned_and_hamster_unaligned", "human_aligned_and_hamster_aligned"
)

# Means for all 115 samples:
colMeans(Human_align_3[, all_count_cols])
                        # Total_reads                     human_unaligned                       human_aligned 
                         # 40453761.5                          39549339.1                            904422.3 
# human_aligned_and_hamster_unaligned   human_aligned_and_hamster_aligned 
                           # 745795.6                            158626.7  <<<<<<<<< use in paper

# sem for all 115 samples (sem() takes the sample size from the data):
vapply(Human_align_3[, all_count_cols], sem, numeric(1))
                        # Total_reads                     human_unaligned                       human_aligned 
                         # 666836.199                          672600.111                           36168.299 
# human_aligned_and_hamster_unaligned   human_aligned_and_hamster_aligned 
                          # 36516.574                            4515.167  <<<<<<<<< use in paper

# Mean correction rate for all 115 samples, i.e. the fraction of human-aligned
# reads that also aligned to hamster (a proportion, not a percentage):
mean(Human_align_3$human_aligned_and_hamster_aligned / Human_align_3$human_aligned)
# [1] 0.2096987 <<<<<<<< use in paper

# sem correction rate for all 115 samples:
sem(Human_align_3$human_aligned_and_hamster_aligned / Human_align_3$human_aligned)
# [1] 0.01143507 <<<<<<<< use in paper



# ~~~~~~~~~~~~~ Seq coverage of total reads in RH pools compared to human genome: ~~~~~~~~~~~~~~~~ 

# reads = 64 bp long. Genome length (incl mitochondria) from human_chr_lengths_1.R = 3088286401
read_length_bp <- 64
genome_length_bp <- 3088286401

# Week-0, conc-0 samples, selected here so that this section runs on its own.
rh_pools <- Human_align_3[Human_align_3$week == 0 & Human_align_3$conc == 0, ]

# Coverage = read length * number of reads / genome length. Each SEM scales
# the SEM of the read counts by the same factor; sem() takes the sample size
# from the data instead of a hard-coded sqrt(6) or sqrt(115).

# mean:
read_length_bp * mean(rh_pools$Total_reads) / genome_length_bp
# [1] 0.851761


# sem:
read_length_bp * sem(rh_pools$Total_reads) / genome_length_bp
# [1] 0.05727345



# ~~~~~~~~~~~~~ Seq coverage of total reads in all samples compared to human genome: ~~~~~~~~~~~~~~~~

# mean:
read_length_bp * mean(Human_align_3$Total_reads) / genome_length_bp
# [1] 0.8383422


# sem:
read_length_bp * sem(Human_align_3$Total_reads) / genome_length_bp
# [1] 0.01381916



# ~~~~~~~~~~~~~ Seq coverage of human-specific aligned reads in RH pools compared to human genome: ~~~~~~~~~~~~~~~~ 

# Same read length and genome length as above.

# mean:
read_length_bp * mean(rh_pools$human_aligned_and_hamster_unaligned) / genome_length_bp
# [1] 0.01542899 <<<<<<<<< use in paper


# sem:
read_length_bp * sem(rh_pools$human_aligned_and_hamster_unaligned) / genome_length_bp
# [1] 0.002447848 <<<<<<<<< use in paper



# ~~~~~~~~~~~~~ Seq coverage of human-specific aligned reads in all samples compared to human genome: ~~~~~~~~~~~~~~~~

# mean:
read_length_bp * mean(Human_align_3$human_aligned_and_hamster_unaligned) / genome_length_bp
# [1] 0.01545547 <<<<<<<<< use in paper


# sem:
read_length_bp * sem(Human_align_3$human_aligned_and_hamster_unaligned) / genome_length_bp
# [1] 0.00075675 <<<<<<<<< use in paper



# ~~~~~~~~~~ Calculate ratio hamster aligned reads to human aligned reads ~~~~~~~~~~~~~~~~~~~~~

# Read corresponding human and hamster tables

# From here:
human <- read.table(
  "RH_pool_human_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

# From RH_hamster_total_align_1.R
hamster <- read.table(
  "RH_pool_hamster_total_align.txt",
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)

dim(human)
# [1] 115  10

dim(hamster)
# [1] 115  10

head(human, n = 6L)
      # RH_ID pool week conc cell Total_reads human_unaligned human_aligned human_aligned_and_hamster_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172        41601530        398642                              298433
# 2 RH1_w1_d0    1    1    0    2    47971567        47496634        474933                              372475
# 3 RH1_w2_d0    1    2    0    2    38276132        37777814        498318                              310062
# 4 RH1_w3_d0    1    3    0    2    34816772        34444880        371892                              288187
# 5 RH1_w4_d0    1    4    0    2    36328023        35818437        509586                              326399
# 6 RH1_w6_d0    1    6    0    2    50327669        49691822        635847                              527642
  # human_aligned_and_hamster_aligned
# 1                            100209
# 2                            102458
# 3                            188256
# 4                             83705
# 5                            183187
# 6                            108205

head(hamster, n = 6L)
      # RH_ID pool week conc cell Total_reads hamster_unaligned hamster_aligned hamster_aligned_and_human_unaligned
# 1 RH1_w0_d0    1    0    0    1    42000172           4304785        37695387                            37595178
# 2 RH1_w1_d0    1    1    0    2    47971567           4982337        42989230                            42886773
# 3 RH1_w2_d0    1    2    0    2    38276132           3876901        34399231                            34210971
# 4 RH1_w3_d0    1    3    0    2    34816772           3585610        31231162                            31147459
# 5 RH1_w4_d0    1    4    0    2    36328023           3623841        32704182                            32520994
# 6 RH1_w6_d0    1    6    0    2    50327669           5325221        45002448                            44894243
  # hamster_aligned_and_human_aligned
# 1                            100209
# 2                            102457
# 3                            188260
# 4                             83703
# 5                            183188
# 6                            108205




# ~~~~~~~~ ratio for RH pools only: ~~~~~~~~~~~~~~

# Not corrected for the ratio of human to hamster genome lengths, but this is appropriate for where the ratio is cited in the paper.

# The ratios divide the hamster table by the human table row by row, so both
# tables must list the same samples in the same order. Fail loudly if not.
stopifnot(
  nrow(human) == nrow(hamster),
  all(human$RH_ID == hamster$RH_ID),
  all(human$week == hamster$week),
  all(human$conc == hamster$conc)
)

# Per-sample ratio of hamster-specific to human-specific aligned reads.
hamster_to_human <- hamster$hamster_aligned_and_human_unaligned / human$human_aligned_and_hamster_unaligned

# Week-0, conc-0 samples.
is_rh_pool <- human$week == 0 & human$conc == 0

mean(hamster_to_human[is_rh_pool])
# [1] 58.90485 <<<<<<<<<<<< use in paper


# sem (sem() takes the sample size from the data instead of a hard-coded sqrt(6))
sem(hamster_to_human[is_rh_pool])
# [1] 14.34735 <<<<<<<<<<<< use in paper



# ~~~~~~~ ratio for all samples: ~~~~~~~~~~~~

mean(hamster_to_human)
# [1] 83.70776 <<<<<<<<< use in paper

# sem for all samples:

sem(hamster_to_human)
# [1] 15.81248 <<<<<<<<< use in paper




# -------------- Uncorrected human reads aligned to all samples -------------------------



mean(human[["human_aligned"]])
# [1] 904422.3 <<<<<<<<< use in paper

sem(human[["human_aligned"]])
# [1] 36168.3 <<<<<<<<< use in paper


# ----------------- Human reads that also aligned to hamster all samples ---------------------------------

# Very similar to "hamster reads that also aligned to human all samples"; the two differ only through minor differences in read alignments.

mean(human[["human_aligned_and_hamster_aligned"]])
# [1] 158626.7 <<<<<<<<< use in paper

sem(human[["human_aligned_and_hamster_aligned"]])
# [1] 4515.167 <<<<<<<<< use in paper



# ------------------ Percent human reads that also aligned to hamster  all samples --------------------------

# The values below are fractions (0-1); multiply by 100 for a percentage.

with(human, mean(human_aligned_and_hamster_aligned / human_aligned))
# [1] 0.2096987 <<<<<<<<< use in paper

with(human, sem(human_aligned_and_hamster_aligned / human_aligned))
# [1] 0.01143507 <<<<<<<<< use in paper




# --------------------- Human specific reads  all samples  -----------------------



mean(human[["human_aligned_and_hamster_unaligned"]])
# [1] 745795.6 <<<<<<<<< use in paper

sem(human[["human_aligned_and_hamster_unaligned"]])
# [1] 36516.57 <<<<<<<<< use in paper




# -------------- Uncorrected hamster reads aligned to all samples -------------------------



mean(hamster[["hamster_aligned"]])
# [1] 35782901


sem(hamster[["hamster_aligned"]])
# [1] 607987.4



# ----------------- hamster reads that also aligned to human all samples ---------------------------------

# Very similar to "Human reads that also aligned to hamster all samples"; the two differ only through minor differences in read alignments.

mean(hamster[["hamster_aligned_and_human_aligned"]])
# [1] 158627 <<<<<<<<<< use in paper


sem(hamster[["hamster_aligned_and_human_aligned"]])
# [1] 4515.189 <<<<<<<<<< use in paper




# ------------------ Percent hamster reads that also aligned to human  all samples --------------------------

# The values below are fractions (0-1); multiply by 100 for a percentage.

with(hamster, mean(hamster_aligned_and_human_aligned / hamster_aligned))
# [1] 0.004536133 <<<<<<<<<< use in paper


with(hamster, sem(hamster_aligned_and_human_aligned / hamster_aligned))
# [1] 0.0001372799 <<<<<<<<<< use in paper





# --------------------- hamster specific reads   all samples -----------------------



mean(hamster[["hamster_aligned_and_human_unaligned"]])
# [1] 35624274



sem(hamster[["hamster_aligned_and_human_unaligned"]])
# [1] 607969.1
 





























