# A23_reads.txt saved from "HEK293_A23_to_picr_bed.xlsx"

A23 <- read.table("A23_reads.txt", header=TRUE, sep="\t", stringsAsFactors=FALSE)

head(A23)
       # Contig_ID  Chromosome Start     End HEK293_vs_picr A23_vs_picr HEK293_vs_picr_not_hg38 A23_vs_picr_not_hg38
# 1 RAZU01000003.1 chromosome1     0 1000000             54       11830                      35                11795
# 2 RAZU01000003.1 chromosome1 10000 1010000             54       11845                      35                11803
# 3 RAZU01000003.1 chromosome1 20000 1020000             54       11868                      35                11814
# 4 RAZU01000003.1 chromosome1 30000 1030000             54       11893                      35                11840
# 5 RAZU01000003.1 chromosome1 40000 1040000             54       11863                      35                11807
# 6 RAZU01000003.1 chromosome1 50000 1050000             57       11864                      38                11807

A23 <- A23[,c(1:4,6)]

head(A23)
       # Contig_ID  Chromosome Start     End A23_vs_picr
# 1 RAZU01000003.1 chromosome1     0 1000000       11830
# 2 RAZU01000003.1 chromosome1 10000 1010000       11845
# 3 RAZU01000003.1 chromosome1 20000 1020000       11868
# 4 RAZU01000003.1 chromosome1 30000 1030000       11893
# 5 RAZU01000003.1 chromosome1 40000 1040000       11863
# 6 RAZU01000003.1 chromosome1 50000 1050000       11864

# get rid of chrM
A23 <- A23[!(A23$Chromosome == "chromosomeM"),]

# get rid of contigs with chromosome unknown
A23 <- A23[!(A23$Chromosome == "chromosomeUnknown"),]

# Sort:
chrOrder<-paste("chromosome",c(1:10,"X"),sep="")
# chrOrder<-c(1:11)
A23$Chromosome <-factor(A23$Chromosome, levels=chrOrder)
A23 <- A23[order(A23$Chromosome,A23$Contig_ID, A23$Start),]
A23$Chromosome <- as.character(A23$Chromosome)

contig_size <- aggregate(End ~ Contig_ID,FUN=max,data=A23)

dim(contig_size)
# [1] 341    2

head(contig_size)
       # Contig_ID      End
# 1 RAZU01000001.1 17833227
# 2 RAZU01000002.1 14745707
# 3 RAZU01000003.1  7084530
# 4 RAZU01000004.1    33761
# 5 RAZU01000005.1    31721
# 6 RAZU01000006.1    31178

contig_size <- unique(merge(contig_size,A23[,c("Contig_ID","Chromosome")]))

dim(contig_size)
# [1] 341    3

head(contig_size)
          # Contig_ID      End   Chromosome
# 1    RAZU01000001.1 17833227 chromosome10
# 1785 RAZU01000002.1 14745707 chromosome10
# 3260 RAZU01000003.1  7084530  chromosome1
# 3969 RAZU01000004.1    33761  chromosome1
# 3973 RAZU01000005.1    31721  chromosome1
# 3977 RAZU01000006.1    31178  chromosome1


chrOrder<-paste("chromosome",c(1:10,"X"),sep="")
# chrOrder<-c(1:11)
contig_size$Chromosome <-factor(contig_size$Chromosome, levels=chrOrder)
contig_size <- contig_size[order(contig_size$Chromosome, contig_size$Contig_ID, contig_size$End),]
contig_size$Chromosome <- as.character(contig_size$Chromosome)



# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ For plotting A23 reads, may make sense to remove bottom 209 contigs as below ~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ If desired to keep all contigs, ignore this section and move to section after ~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Chose this path for paper ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# number contigs taken away with size less than threshold spaced by 1e5 bp
x<-c(0); for(i in c(1:806)) {x[i] <- length(contig_size[contig_size$End < c(seq(1, max(contig_size$End),1e5))[i],"End"])}

# proportion of sequence removed
y<-c(0); for(i in c(1:806)) {y[i] <- sum(contig_size[contig_size$End < c(seq(1, max(contig_size$End),1e5))[i],"End"])/sum(contig_size$End)}

plot(x,y)

# Taking away smallest 150 contigs still preserves > 95% seq
Efficiency <- data.frame(contigs_removed = x, proport_seq_remove = y)

head(Efficiency,10)
   # contigs_removed proport_seq_remove
# 1                0        0.000000000
# 2              127        0.002059504
# 3              143        0.003011480
# 4              145        0.003217377
# 5              149        0.003809124
# 6              153        0.004586610
# 7              156        0.005294000
# 8              160        0.006426111
# 9              163        0.007402651
# 10             166        0.008546840

tail(Efficiency)
    # contigs_removed proport_seq_remove
# 801             340          0.9647924
# 802             340          0.9647924
# 803             340          0.9647924
# 804             340          0.9647924
# 805             340          0.9647924
# 806             340          0.9647924

# For example, a threshold contig of > 3.61305e6 removes 5% of sequence

sum(contig_size[contig_size$End < 3.61305e6,"End"])/sum(contig_size$End)
# [1] 0.04969693

sum(contig_size[contig_size$End < 3.61306e6,"End"])/sum(contig_size$End)
# [1] 0.05127549

# But removes 209 out of 341 contigs

length(contig_size[contig_size$End < 3.61305e6,"End"])
# [1] 209

length(contig_size[,"End"])
# [1] 341

# which is 61% of contigs:
length(contig_size[contig_size$End < 3.61305e6,"End"])/length(contig_size[,"End"])
# [1] 0.6129032

# leaving 132 contigs:
length(contig_size[,"End"]) - length(contig_size[contig_size$End < 3.61305e6,"End"])
# [1] 132


# To subset A23 use following form:
dim(A23)
# [1] 229061    122

dim(A23[A23$Contig_ID %in% contig_size[contig_size$End >= 3.61305e6,"Contig_ID"],])
# [1] 217580    122

# As expected, leaves 95% of sequence intact.
# Note, this calx is very close (but not quite exact) because does not take into account ramp ups, ramp downs and edge effects at ends of contigs
# Just a sanity check
# Use calx above for percent of sequence left intact.
217580/229061
# [1] 0.949878

# As expected, 132 contigs are left:
length(unique(A23[A23$Contig_ID %in% contig_size[contig_size$End >= 3.61305e6,"Contig_ID"],"Contig_ID"]))
# [1] 132

# Thus, let's get rid of contigs below threshold:
contig_size <- contig_size[contig_size$End >= 3.61305e6,]

# For genome-wide coords:
# contig_size$End_coord <- c(0,cumsum(as.numeric(contig_size$End))[-nrow(contig_size)])

# For local chromosome-specific coords:
vec_length_A <- 1
for(i in c(1:10,"X")) {
	
	vec_length_B <- vec_length_A + length(c(0,cumsum(as.numeric(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),]$End))[-nrow(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),])]))
	contig_size[vec_length_A:(vec_length_B-1),"End_coord"] <- c(0,cumsum(as.numeric(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),]$End))[-nrow(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),])])
	vec_length_A <- vec_length_B
	}

dim(contig_size)
# [1] 132   4


head(contig_size)
          # Contig_ID      End  Chromosome End_coord
# 3260 RAZU01000003.1  7084530 chromosome1         0
# 3981 RAZU01000007.1 40532242 chromosome1   7084530
# 8039 RAZU01000009.1  5489406 chromosome1  47616772
# 8588 RAZU01000010.1  4531923 chromosome1  53106178
# 9042 RAZU01000011.1  4526673 chromosome1  57638101
# 9495 RAZU01000012.1 36610392 chromosome1  62164774


# # ~~~~~~~~~ Minimum and mean contig size (Compare with minimum and mean results using all contigs below): ~~~~~~~~~~ # #
# Minimum contig increases from 568 bp to 3.6 Mb

min(contig_size$End)
# [1] 3613054


# Mean contig size increases from 6.7 Mb to 16.5 Mb
mean(contig_size$End)
# [1] 16477872


sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}


sem(contig_size$End)
# [1] 1281198


# # ~~~~~~~~~ Continue: ~~~~~~~~~~~~ # #

A23 <- A23[A23$Contig_ID %in% contig_size[,"Contig_ID"],]

A23 <- merge(A23,contig_size[,c(1,3:4)])

chrOrder<-paste("chromosome",c(1:10,"X"),sep="")
# chrOrder<-c(1:11)
A23$Chromosome <-factor(A23$Chromosome, levels=chrOrder)
A23 <- A23[order(A23$Chromosome, A23$Contig_ID, A23$End),]
A23$Chromosome <- as.character(A23$Chromosome)

# Genome coords in Start_new and End_new
A23$Start_new <- A23$Start+A23$End_coord
A23$End_new <- A23$End+A23$End_coord

colnames(A23)[7:8] <- c("posS","posE")

A23 <- A23[,c(1:2,7:8,5)]

A23$pos <- round(rowMeans(A23[,c("posS","posE")]))

A23 <- A23[,c(1:4,6,5)]

#substitute chr for chromosome:
A23$Chromosome <- gsub("chromosome","chr", A23$Chromosome)

# ensure order:
chrOrder<-c(paste("chr",1:10,sep=""),"chrX")
A23$Chromosome <-factor(A23$Chromosome, levels=chrOrder)
A23 <- A23[order(A23$Chromosome, A23$pos), ]
A23$Chromosome <- as.character(A23$Chromosome)

dim(A23)
# [1] 217580    6

head(A23)
          # Contig_ID Chromosome  posS    posE    pos A23_vs_picr
# 3260 RAZU01000003.1       chr1     0 1000000 500000       11830
# 3261 RAZU01000003.1       chr1 10000 1010000 510000       11845
# 3262 RAZU01000003.1       chr1 20000 1020000 520000       11868
# 3263 RAZU01000003.1       chr1 30000 1030000 530000       11893
# 3264 RAZU01000003.1       chr1 40000 1040000 540000       11863
# 3265 RAZU01000003.1       chr1 50000 1050000 550000       11864

colnames(A23)[6] <- c("reads")

head(A23)
          # Contig_ID Chromosome  posS    posE    pos reads
# 3260 RAZU01000003.1       chr1     0 1000000 500000 11830
# 3261 RAZU01000003.1       chr1 10000 1010000 510000 11845
# 3262 RAZU01000003.1       chr1 20000 1020000 520000 11868
# 3263 RAZU01000003.1       chr1 30000 1030000 530000 11893
# 3264 RAZU01000003.1       chr1 40000 1040000 540000 11863
# 3265 RAZU01000003.1       chr1 50000 1050000 550000 11864

write.table(A23,"A23_gseq.txt",quote=FALSE,sep="\t",row.names=FALSE)


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~ To keep all contigs, proceed as below ~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


# For genome-wide coords:
# contig_size$End_coord <- c(0,cumsum(as.numeric(contig_size$End))[-nrow(contig_size)])

# For local chromosome-specific coords:
vec_length_A <- 1
for(i in c(1:10,"X")) {
	
	vec_length_B <- vec_length_A + length(c(0,cumsum(as.numeric(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),]$End))[-nrow(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),])]))
	contig_size[vec_length_A:(vec_length_B-1),"End_coord"] <- c(0,cumsum(as.numeric(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),]$End))[-nrow(contig_size[contig_size[,"Chromosome"]==paste0("chromosome",i),])])
	vec_length_A <- vec_length_B
	}

dim(contig_size)
# [1] 341   4


head(contig_size)
          # Contig_ID      End  Chromosome End_coord
# 3260 RAZU01000003.1  7084530 chromosome1         0
# 3969 RAZU01000004.1    33761 chromosome1   7084530
# 3973 RAZU01000005.1    31721 chromosome1   7118291
# 3977 RAZU01000006.1    31178 chromosome1   7150012
# 3981 RAZU01000007.1 40532242 chromosome1   7181190
# 8035 RAZU01000008.1    31103 chromosome1  47713432

# # ~~~~~~~ Minimum and mean contig size:  ~~~~~~~~~~~ # #

min(contig_size$End)
# [1] 568

mean(contig_size$End)
# [1] 6712102

sem <- function(x) {sqrt(var(x,na.rm=TRUE)/sum(!is.na(x)))}

sem(contig_size$End)
# [1] 650807.2

# # ~~~~~~~~ Continue: ~~~~~~~~~~~~ # #

A23 <- A23[A23$Contig_ID %in% contig_size[,"Contig_ID"],]

A23 <- merge(A23,contig_size[,c(1,3:4)])

chrOrder<-paste("chromosome",c(1:10,"X"),sep="")
# chrOrder<-c(1:11)
A23$Chromosome <-factor(A23$Chromosome, levels=chrOrder)
A23 <- A23[order(A23$Chromosome, A23$Contig_ID, A23$End),]
A23$Chromosome <- as.character(A23$Chromosome)

# Genome coords in Start_new and End_new
A23$Start_new <- A23$Start+A23$End_coord
A23$End_new <- A23$End+A23$End_coord

colnames(A23)[7:8] <- c("posS","posE")

A23 <- A23[,c(1:2,7:8,5)]

A23$pos <- round(rowMeans(A23[,c("posS","posE")]))

A23 <- A23[,c(1:4,6,5)]

#substitute chr for chromosome:
A23$Chromosome <- gsub("chromosome","chr", A23$Chromosome)

# ensure order:
chrOrder<-c(paste("chr",1:10,sep=""),"chrX")
A23$Chromosome <-factor(A23$Chromosome, levels=chrOrder)
A23 <- A23[order(A23$Chromosome, A23$pos), ]
A23$Chromosome <- as.character(A23$Chromosome)

dim(A23)
# [1] 229061    6

head(A23)
          # Contig_ID Chromosome  posS    posE    pos A23_vs_picr
# 3260 RAZU01000003.1       chr1     0 1000000 500000       11830
# 3261 RAZU01000003.1       chr1 10000 1010000 510000       11845
# 3262 RAZU01000003.1       chr1 20000 1020000 520000       11868
# 3263 RAZU01000003.1       chr1 30000 1030000 530000       11893
# 3264 RAZU01000003.1       chr1 40000 1040000 540000       11863
# 3265 RAZU01000003.1       chr1 50000 1050000 550000       11864

colnames(A23)[6] <- c("reads")

# write.table(A23,"A23_gseq.txt",quote=FALSE,sep="\t",row.names=FALSE)


















