# RH_hamster_reads.txt saved from "RH_PICR_RAW_ordered_fixed_11_29_18.xlsx"


# Load the tab-delimited table from RH_hamster_reads.txt. The first row holds
# the column names; text columns stay character rather than becoming factors.
RH_hamster <- read.table(
  file = "RH_hamster_reads.txt",
  sep = "\t",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Size of the table as loaded:
dim(RH_hamster)
# [1] 237825    119

# Top-left 10 x 10 corner of the table:
RH_hamster[seq_len(10), seq_len(10)]
        # Contig_id  Chromosome Strat     End RH1_w0_0 RH1_w1_0 RH1_w1_8 RH1_w1_25 RH1_w1_75 RH1_w2_0
# 1  RAZU01000003.1 chromosome1     0 1000000    14756    16631    21242     19091     22873    13144
# 2  RAZU01000003.1 chromosome1 10000 1010000    14752    16628    21232     19088     22873    13134
# 3  RAZU01000003.1 chromosome1 20000 1020000    14767    16652    21298     19131     22889    13162
# 4  RAZU01000003.1 chromosome1 30000 1030000    14781    16671    21316     19174     22872    13204
# 5  RAZU01000003.1 chromosome1 40000 1040000    14757    16601    21268     19113     22818    13197
# 6  RAZU01000003.1 chromosome1 50000 1050000    14791    16599    21308     19140     22821    13190
# 7  RAZU01000003.1 chromosome1 60000 1060000    14785    16611    21298     19127     22811    13174
# 8  RAZU01000003.1 chromosome1 70000 1070000    14782    16575    21309     19109     22802    13186
# 9  RAZU01000003.1 chromosome1 80000 1080000    14794    16586    21300     19109     22819    13179
# 10 RAZU01000003.1 chromosome1 90000 1090000    14806    16586    21257     19103     22812    13173

# Rename columns
# Columns 1 and 3 get their canonical names.
colnames(RH_hamster)[c(1, 3)] <- c("Contig_ID", "Start")

# A trailing _0 / _8 / _25 / _75 on a column name gains a "d" prefix
# (e.g. RH1_w1_25 -> RH1_w1_d25). Patterns are anchored at the end of the
# name, so each name is rewritten by at most one of them.
day_suffix_map <- c(
  "_0$" = "_d0",
  "_8$" = "_d8",
  "_25$" = "_d25",
  "_75$" = "_d75"
)
for (suffix_pattern in names(day_suffix_map)) {
  colnames(RH_hamster) <- gsub(
    suffix_pattern,
    day_suffix_map[[suffix_pattern]],
    colnames(RH_hamster)
  )
}


# Check the renamed header:
RH_hamster[seq_len(10), seq_len(10)]
        # Contig_ID  Chromosome Start     End RH1_w0_d0 RH1_w1_d0 RH1_w1_d8 RH1_w1_d25 RH1_w1_d75 RH1_w2_d0
# 1  RAZU01000003.1 chromosome1     0 1000000     14756     16631     21242      19091      22873     13144
# 2  RAZU01000003.1 chromosome1 10000 1010000     14752     16628     21232      19088      22873     13134
# 3  RAZU01000003.1 chromosome1 20000 1020000     14767     16652     21298      19131      22889     13162
# 4  RAZU01000003.1 chromosome1 30000 1030000     14781     16671     21316      19174      22872     13204
# 5  RAZU01000003.1 chromosome1 40000 1040000     14757     16601     21268      19113      22818     13197
# 6  RAZU01000003.1 chromosome1 50000 1050000     14791     16599     21308      19140      22821     13190
# 7  RAZU01000003.1 chromosome1 60000 1060000     14785     16611     21298      19127      22811     13174
# 8  RAZU01000003.1 chromosome1 70000 1070000     14782     16575     21309      19109      22802     13186
# 9  RAZU01000003.1 chromosome1 80000 1080000     14794     16586     21300      19109      22819     13179
# 10 RAZU01000003.1 chromosome1 90000 1090000     14806     16586     21257      19103      22812     13173

# Drop the rows assigned to chrM.
RH_hamster <- RH_hamster[RH_hamster$Chromosome != "chromosomeM", ]

# Drop the rows whose contig has no known chromosome.
RH_hamster <- RH_hamster[RH_hamster$Chromosome != "chromosomeUnknown", ]



# Sort by chromosome (1..10, then X), then contig, then window start.
# The chromosome column is a factor only while sorting, so that the level
# order, not alphabetical order, decides the ranking.
chrOrder <- paste0("chromosome", c(1:10, "X"))
# chrOrder<-c(1:11)
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
row_order <- order(
  RH_hamster$Chromosome,
  RH_hamster$Contig_ID,
  RH_hamster$Start
)
RH_hamster <- RH_hamster[row_order, ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)

# Contig size = the largest End value recorded for each contig.
contig_size <- aggregate(End ~ Contig_ID, data = RH_hamster, FUN = max)

dim(contig_size)
# [1] 341    2

head(contig_size)
       # Contig_ID      End
# 1 RAZU01000001.1 17833227
# 2 RAZU01000002.1 14745707
# 3 RAZU01000003.1  7084530
# 4 RAZU01000004.1    33761
# 5 RAZU01000005.1    31721
# 6 RAZU01000006.1    31178



# Attach the chromosome of each contig: join against the per-row
# (Contig_ID, Chromosome) pairs, then discard the repeated rows the join yields.
contig_chrom <- merge(contig_size, RH_hamster[, c("Contig_ID", "Chromosome")])
contig_size <- contig_chrom[!duplicated(contig_chrom), , drop = FALSE]

dim(contig_size)
# [1] 341    3

head(contig_size)
          # Contig_ID      End   Chromosome
# 1    RAZU01000001.1 17833227 chromosome10
# 1785 RAZU01000002.1 14745707 chromosome10
# 3260 RAZU01000003.1  7084530  chromosome1
# 3969 RAZU01000004.1    33761  chromosome1
# 3973 RAZU01000005.1    31721  chromosome1
# 3977 RAZU01000006.1    31178  chromosome1


# Put the contigs in chromosome order (1..10, then X), then by contig and size.
chrOrder <- paste0("chromosome", c(1:10, "X"))
# chrOrder<-c(1:11)
contig_size$Chromosome <- factor(contig_size$Chromosome, levels = chrOrder)
contig_order <- order(
  contig_size$Chromosome,
  contig_size$Contig_ID,
  contig_size$End
)
contig_size <- contig_size[contig_order, ]
contig_size$Chromosome <- as.character(contig_size$Chromosome)




# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ For plotting p values it may make sense to remove the 209 smallest contigs, as below ~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ If desired to keep all contigs, ignore this section and move to section after ~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Chose this path for paper ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Sweep a minimum-contig-size cut-off from 1 bp up to the largest contig in
# steps of 1e5 bp, recording for each cut-off how many contigs fall below it
# and what share of the total sequence those contigs hold.
#
# The cut-offs are built once and their number follows from the data (the
# earlier version hard-coded 806 steps and rebuilt the sequence on every pass).
# Sizes are converted to double first: if End was read as integer, summing all
# contigs exceeds .Machine$integer.max and sum() would return NA.
contig_bp <- as.numeric(contig_size$End)
size_cutoffs <- seq(1, max(contig_bp), 1e5)

# number contigs taken away with size less than threshold spaced by 1e5 bp
x <- vapply(
  size_cutoffs,
  function(cutoff) sum(contig_bp < cutoff),
  numeric(1)
)

# proportion of sequence removed
y <- vapply(
  size_cutoffs,
  function(cutoff) sum(contig_bp[contig_bp < cutoff]),
  numeric(1)
) / sum(contig_bp)

plot(x,y)

# Taking away smallest 150 contigs still preserves > 95% seq
Efficiency <- data.frame(contigs_removed = x, proport_seq_remove = y)

head(Efficiency,10)
   # contigs_removed proport_seq_remove
# 1                0        0.000000000
# 2              127        0.002059504
# 3              143        0.003011480
# 4              145        0.003217377
# 5              149        0.003809124
# 6              153        0.004586610
# 7              156        0.005294000
# 8              160        0.006426111
# 9              163        0.007402651
# 10             166        0.008546840

tail(Efficiency)
    # contigs_removed proport_seq_remove
# 801             340          0.9647924
# 802             340          0.9647924
# 803             340          0.9647924
# 804             340          0.9647924
# 805             340          0.9647924
# 806             340          0.9647924

# For example, a threshold contig of > 3.61305e6 removes 5% of sequence <<<<<<< use in paper
# The cut-off is named once so that every statistic below and the final filter
# are guaranteed to use the same value.
size_threshold <- 3.61305e6

# Sizes as double: if End was read as integer, the all-contig total
# (mean 6712102 bp x 341 contigs) exceeds .Machine$integer.max and sum()
# would return NA with an overflow warning.
contig_bp <- as.numeric(contig_size$End)
total_bp <- sum(contig_bp)
below_threshold <- contig_bp < size_threshold
at_or_above_threshold <- contig_bp >= size_threshold

# Share of sequence held by the contigs below the cut-off:
sum(contig_bp[below_threshold]) / total_bp
# [1] 0.04969693

# A cut-off 10 bp higher already takes out more than 5%:
sum(contig_bp[contig_bp < 3.61306e6]) / total_bp
# [1] 0.05127549

# But removes 209 out of 341 contigs

sum(below_threshold)
# [1] 209 <<<<<<<<<< use in paper

nrow(contig_size)
# [1] 341 <<<<<<<<<< use in paper

# which is 61% of contigs:
sum(below_threshold) / nrow(contig_size)
# [1] 0.6129032 <<<<<<<<<<<< use in paper

# leaving 132 contigs:
nrow(contig_size) - sum(below_threshold)
# [1] 132 <<<<<<<<<<<< use in paper

# which is 95% of sequence
sum(contig_bp[at_or_above_threshold]) / total_bp
# [1] 0.9503031  <<<<<<<<<<<< use in paper


# To subset RH_hamster use following form:
kept_contig_ids <- contig_size[at_or_above_threshold, "Contig_ID"]
on_kept_contig <- RH_hamster$Contig_ID %in% kept_contig_ids

dim(RH_hamster)
# [1] 229061    122
# NOTE(review): RH_hamster was loaded with 119 columns and no column has been
# added by this point, so the recorded 122 was presumably captured in a
# different session state -- confirm.

dim(RH_hamster[on_kept_contig, ])
# [1] 217580    122

# As expected, leaves 95% of sequence intact.
# Note: this row-based ratio is very close to (but not exactly) the
# sequence-based figure because it does not take into account ramp ups,
# ramp downs and edge effects at the ends of contigs.
# Just a sanity check; use the sequence-based calculation above for the
# percent of sequence left intact.
sum(on_kept_contig) / nrow(RH_hamster)
# [1] 0.949878

# As expected, 132 contigs are left:
length(unique(RH_hamster[on_kept_contig, "Contig_ID"]))
# [1] 132 <<<<<<<<<<< use in paper

# Thus, let's get rid of contigs below threshold:
contig_size <- contig_size[at_or_above_threshold, ]

# For genome-wide coords:
# contig_size$End_coord <- c(0,cumsum(as.numeric(contig_size$End))[-nrow(contig_size)])

# For local chromosome-specific coords:
# End_coord is the offset of each contig within its own chromosome: the summed
# size of the contigs listed before it on that chromosome (0 for the first).
# The offsets are computed per chromosome group, so they are written to the
# right rows whatever the row order and even if a chromosome has no contigs
# left. The earlier position-counting loop relied on contig_size being sorted
# in chromosome order with every chromosome present.
# as.numeric() keeps the running total from overflowing integer range.
contig_size$End_coord <- ave(
  as.numeric(contig_size$End),
  contig_size$Chromosome,
  FUN = function(sizes) c(0, head(cumsum(sizes), -1))
)


dim(contig_size)
# [1] 132   4


head(contig_size)
          # Contig_ID      End  Chromosome End_coord
# 3260 RAZU01000003.1  7084530 chromosome1         0
# 3981 RAZU01000007.1 40532242 chromosome1   7084530
# 8039 RAZU01000009.1  5489406 chromosome1  47616772
# 8588 RAZU01000010.1  4531923 chromosome1  53106178
# 9042 RAZU01000011.1  4526673 chromosome1  57638101
# 9495 RAZU01000012.1 36610392 chromosome1  62164774


# # ~~~~~~~~~ Minimum and mean contig size (Compare with minimum and mean results using all contigs below): ~~~~~~~~~~ # #
# Minimum contig increases from 568 bp to 3.6 Mb

# Smallest contig remaining after the size filter:
min(contig_size[["End"]])
# [1] 3613054


# Mean contig size increases from 6.7 Mb to 16.5 Mb
mean(contig_size[["End"]])
# [1] 16477872


# Standard error of the mean of x, ignoring NA values: the square root of
# (sample variance / number of non-missing observations).
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}


# Standard error of the mean contig size:
sem(contig_size[["End"]])
# [1] 1281198


# # ~~~~~~~~~ Continue: ~~~~~~~~~~~~ # #


# Keep only the rows that sit on a contig still present in contig_size.
RH_hamster <- RH_hamster[RH_hamster$Contig_ID %in% contig_size[, "Contig_ID"], ]

# Add each contig's chromosome-local offset (End_coord). The join runs on the
# columns the two tables share: Contig_ID and Chromosome.
RH_hamster <- merge(
  RH_hamster,
  contig_size[, c("Contig_ID", "Chromosome", "End_coord")]
)

chrOrder <- paste0("chromosome", c(1:10, "X"))
# chrOrder<-c(1:11)
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$Contig_ID, RH_hamster$End), ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)

# Every column that is not a coordinate column is carried through unchanged.
# Columns are picked by name: the earlier positional indices (5:119, 121:122,
# 120) were only right for a table with exactly 115 such columns.
coord_cols <- c("Contig_ID", "Chromosome", "Start", "End", "End_coord")
sample_cols <- setdiff(colnames(RH_hamster), coord_cols)

# Chromosome-local coordinates: contig-local Start/End shifted by the offset.
RH_hamster$posS <- RH_hamster$Start + RH_hamster$End_coord
RH_hamster$posE <- RH_hamster$End + RH_hamster$End_coord

# Midpoint of each window, rounded to a whole base.
RH_hamster$pos <- round(rowMeans(RH_hamster[, c("posS", "posE")]))

RH_hamster <- RH_hamster[, c("Contig_ID", "Chromosome", "posS", "posE", "pos", sample_cols)]

#substitute chr for chromosome:
RH_hamster$Chromosome <- gsub("chromosome", "chr", RH_hamster$Chromosome)

# ensure order:
chrOrder <- c(paste0("chr", 1:10), "chrX")
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)

dim(RH_hamster)
# [1] 217580    120

RH_hamster[1:10, 1:10]
        # Contig_ID Chromosome  posS    posE    pos RH1_w0_d0 RH1_w1_d0 RH1_w1_d8 RH1_w1_d25 RH1_w1_d75
# 1  RAZU01000003.1       chr1     0 1000000 500000     14756     16631     21242      19091      22873
# 2  RAZU01000003.1       chr1 10000 1010000 510000     14752     16628     21232      19088      22873
# 3  RAZU01000003.1       chr1 20000 1020000 520000     14767     16652     21298      19131      22889
# 4  RAZU01000003.1       chr1 30000 1030000 530000     14781     16671     21316      19174      22872
# 5  RAZU01000003.1       chr1 40000 1040000 540000     14757     16601     21268      19113      22818
# 6  RAZU01000003.1       chr1 50000 1050000 550000     14791     16599     21308      19140      22821
# 7  RAZU01000003.1       chr1 60000 1060000 560000     14785     16611     21298      19127      22811
# 8  RAZU01000003.1       chr1 70000 1070000 570000     14782     16575     21309      19109      22802
# 9  RAZU01000003.1       chr1 80000 1080000 580000     14794     16586     21300      19109      22819
# 10 RAZU01000003.1       chr1 90000 1090000 590000     14806     16586     21257      19103      22812

# # RH_hamster_gseq.txt employed for genome scans:
# write.table(RH_hamster,"RH_hamster_gseq.txt",quote=FALSE,sep="\t",row.names=FALSE)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ To keep all contigs, proceed as below ~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# NOTE(review): this path expects the full, unfiltered contig_size. If the
# size-filter section above has already been run in this session, contig_size
# has been cut down to the retained contigs and must be rebuilt first.

# For genome-wide coords:
# contig_size$End_coord <- c(0,cumsum(as.numeric(contig_size$End))[-nrow(contig_size)])

# For local chromosome-specific coords:
# End_coord is the offset of each contig within its own chromosome: the summed
# size of the contigs listed before it on that chromosome (0 for the first).
# The offsets are computed per chromosome group, so they are written to the
# right rows whatever the row order and even if a chromosome has no contigs.
# The earlier position-counting loop relied on contig_size being sorted in
# chromosome order with every chromosome present.
# as.numeric() keeps the running total from overflowing integer range.
contig_size$End_coord <- ave(
  as.numeric(contig_size$End),
  contig_size$Chromosome,
  FUN = function(sizes) c(0, head(cumsum(sizes), -1))
)


dim(contig_size)
# [1] 341   4


head(contig_size)
          # Contig_ID      End  Chromosome End_coord
# 3260 RAZU01000003.1  7084530 chromosome1         0
# 3969 RAZU01000004.1    33761 chromosome1   7084530
# 3973 RAZU01000005.1    31721 chromosome1   7118291
# 3977 RAZU01000006.1    31178 chromosome1   7150012
# 3981 RAZU01000007.1 40532242 chromosome1   7181190
# 8035 RAZU01000008.1    31103 chromosome1  47713432

# # ~~~~~~~ Minimum and mean contig size:  ~~~~~~~~~~~ # #

# Smallest contig when all contigs are kept:
min(contig_size[["End"]])
# [1] 568

# Mean contig size when all contigs are kept:
mean(contig_size[["End"]])
# [1] 6712102

# Standard error of the mean of x, ignoring NA values: the square root of
# (sample variance / number of non-missing observations).
sem <- function(x) {
  n_obs <- sum(!is.na(x))
  sqrt(var(x, na.rm = TRUE) / n_obs)
}

# Standard error of the mean contig size:
sem(contig_size[["End"]])
# [1] 650807.2

# # ~~~~~~~~ Continue: ~~~~~~~~~~~~ # #

# NOTE(review): this path needs RH_hamster to still hold its Start and End
# columns. The size-filter section above replaces them with posS/posE, so the
# table must be reloaded if that section has already been run in this session.

# Keep only the rows that sit on a contig present in contig_size.
RH_hamster <- RH_hamster[RH_hamster$Contig_ID %in% contig_size[, "Contig_ID"], ]

# Add each contig's chromosome-local offset (End_coord). The join runs on the
# columns the two tables share: Contig_ID and Chromosome.
RH_hamster <- merge(
  RH_hamster,
  contig_size[, c("Contig_ID", "Chromosome", "End_coord")]
)

chrOrder <- paste0("chromosome", c(1:10, "X"))
# chrOrder<-c(1:11)
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$Contig_ID, RH_hamster$End), ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)

# Every column that is not a coordinate column is carried through unchanged.
# Columns are picked by name: the earlier positional indices (5:119, 121:122,
# 120) were only right for a table with exactly 115 such columns.
coord_cols <- c("Contig_ID", "Chromosome", "Start", "End", "End_coord")
sample_cols <- setdiff(colnames(RH_hamster), coord_cols)

# Chromosome-local coordinates: contig-local Start/End shifted by the offset.
RH_hamster$posS <- RH_hamster$Start + RH_hamster$End_coord
RH_hamster$posE <- RH_hamster$End + RH_hamster$End_coord

# Midpoint of each window, rounded to a whole base.
RH_hamster$pos <- round(rowMeans(RH_hamster[, c("posS", "posE")]))

RH_hamster <- RH_hamster[, c("Contig_ID", "Chromosome", "posS", "posE", "pos", sample_cols)]

#substitute chr for chromosome:
RH_hamster$Chromosome <- gsub("chromosome", "chr", RH_hamster$Chromosome)

# ensure order:
chrOrder <- c(paste0("chr", 1:10), "chrX")
RH_hamster$Chromosome <- factor(RH_hamster$Chromosome, levels = chrOrder)
RH_hamster <- RH_hamster[order(RH_hamster$Chromosome, RH_hamster$pos), ]
RH_hamster$Chromosome <- as.character(RH_hamster$Chromosome)

dim(RH_hamster)
# [1] 229061    120

RH_hamster[1:10, 1:10]
          # Contig_ID Chromosome  posS    posE    pos RH1_w0_d0 RH1_w1_d0 RH1_w1_d8 RH1_w1_d25 RH1_w1_d75
# 3260 RAZU01000003.1       chr1     0 1000000 500000     14756     16631     21242      19091      22873
# 3261 RAZU01000003.1       chr1 10000 1010000 510000     14752     16628     21232      19088      22873
# 3262 RAZU01000003.1       chr1 20000 1020000 520000     14767     16652     21298      19131      22889
# 3263 RAZU01000003.1       chr1 30000 1030000 530000     14781     16671     21316      19174      22872
# 3264 RAZU01000003.1       chr1 40000 1040000 540000     14757     16601     21268      19113      22818
# 3265 RAZU01000003.1       chr1 50000 1050000 550000     14791     16599     21308      19140      22821
# 3266 RAZU01000003.1       chr1 60000 1060000 560000     14785     16611     21298      19127      22811
# 3267 RAZU01000003.1       chr1 70000 1070000 570000     14782     16575     21309      19109      22802
# 3268 RAZU01000003.1       chr1 80000 1080000 580000     14794     16586     21300      19109      22819
# 3269 RAZU01000003.1       chr1 90000 1090000 590000     14806     16586     21257      19103      22812

# # Not used
# write.table(RH_hamster,"RH_hamster_gseq.txt",quote=FALSE,sep="\t",row.names=FALSE)






































