require(rtracklayer)

# Command-line handling.
# The script is piped into "R --vanilla", so the user-supplied values are read
# from position 3 of commandArgs() onwards:
#   args[3] - path of the input BED file (opened through gzfile())
#   args[4] - path of the file holding the saved reference ranges object
#   args[5] - path the annotated reference object is saved to
args <- commandArgs()

if (anyNA(args[3:5])) {
	stop("expected three arguments: <input bed file> <reference object file> <output object file>", call. = FALSE)
}

input_bed_file <- args[3]
input_GR_ref_obj <- args[4]
output_obj_file <- args[5]

# Load the reference into a scratch environment rather than the global one.
# Loading globally and then removing the loaded name would delete (or
# overwrite) one of this script's own variables whenever the stored object
# happens to share its name, e.g. an object saved as "input_GR_ref".
ref_load_env <- new.env()
input_GR_ref_name <- load(input_GR_ref_obj, envir = ref_load_env)

if (length(input_GR_ref_name) != 1) {
	stop("reference object file must contain exactly one object, found ", length(input_GR_ref_name), call. = FALSE)
}

input_GR_ref <- get(input_GR_ref_name, envir = ref_load_env, inherits = FALSE)

rm(ref_load_env)

# Read the reads; gzfile() lets import() read a compressed BED file.
raw_bed_GR <- import(con = gzfile(input_bed_file))

# The files from the HTT locus and the SORT1 locus have coordinates relative to
# a locus-specific contig. Convert them to absolute hg38 coordinates and leave
# all other reads (e.g. E. coli) untouched.

# Re-express reads that sit on a locus contig in chromosome coordinates.
#   bed_gr         - ranges object holding the reads
#   contig_pattern - pattern (as used by grep) identifying the locus contig
#   chrom          - chromosome name assigned to the matching reads
#   offset         - value added to the start and end of the matching reads
# Returns a new ranges object built from seqnames/start/end/strand only, so
# any extra columns carried by bed_gr are not kept.
shift_locus_to_chrom <- function(bed_gr, contig_pattern, chrom, offset) {
	contig_names <- as.character(seqnames(bed_gr))
	on_contig <- grep(contig_pattern, contig_names)
	contig_names[on_contig] <- chrom

	shifted_starts <- start(bed_gr)
	shifted_starts[on_contig] <- start(bed_gr[on_contig]) + offset
	shifted_ends <- end(bed_gr)
	shifted_ends[on_contig] <- end(bed_gr[on_contig]) + offset

	shifted_df <- data.frame(chr = contig_names, start = shifted_starts, end = shifted_ends, strand = strand(bed_gr))
	makeGRangesFromDataFrame(shifted_df)
}

# Only the leading reads (at most 10000) are inspected to decide which loci
# are present. The index is clamped to the number of reads so that a file with
# fewer than 10000 records does not trigger an out-of-bounds subscript.
leading_seqnames <- as.character(seqnames(raw_bed_GR[seq_len(min(10000, length(raw_bed_GR)))]))

htt_loc_found <- length(grep("htt_loc_hg38_chr4_2598274_3998273", leading_seqnames)) > 0

sort1_loc1_found <- length(grep("RP11_47M16_BAC_hg38", leading_seqnames)) > 0

sort1_loc2_found <- length(grep("RP11_463O24_BAC_hg38", leading_seqnames)) > 0

rm(leading_seqnames)

if (htt_loc_found) {
	raw_bed_GR <- shift_locus_to_chrom(raw_bed_GR, "htt_loc_hg38_chr4_2598274_3998273", "chr4", 2598273)
}

if (sort1_loc1_found) {
	raw_bed_GR <- shift_locus_to_chrom(raw_bed_GR, "RP11_47M16_BAC_hg38", "chr1", 109267734)
}

if (sort1_loc2_found) {
	raw_bed_GR <- shift_locus_to_chrom(raw_bed_GR, "RP11_463O24_BAC_hg38", "chr1", 109197414)
}


# Loop through subsets of input_GR_ref so R doesn't have to allocate as big a
# vector. Each element of list_chunk_indices is c(first_index, last_index) for
# one chunk of at most chunk_size reference ranges.
chunk_size <- 100000
num_chunks <- ceiling(length(input_GR_ref) / chunk_size)

# seq_len() gives an empty sequence when the reference is empty, whereas
# 1:num_chunks would count down from 1 to 0 and produce bogus chunks.
list_chunk_indices <- lapply(seq_len(num_chunks), function(num_chunk_x) {
	startx <- 1 + (num_chunk_x - 1) * chunk_size
	# Clamp so the final chunk stops at the last reference range.
	endx <- pmin(num_chunk_x * chunk_size, length(input_GR_ref))
	c(startx, endx)
})

# Count, chunk by chunk, how many reads overlap each reference range.
#   ref_gr       - reference ranges (strand already set by the caller)
#   reads_gr     - reads to count
#   chunk_bounds - list of c(first_index, last_index) pairs indexing ref_gr
# Returns a numeric vector with one count per element of ref_gr; ranges with
# no overlapping read get 0.
count_overlaps_by_chunk <- function(ref_gr, reads_gr, chunk_bounds) {
	counts <- numeric(length(ref_gr))
	for (bounds in chunk_bounds) {
		print(paste("overlapping", bounds[1], "to", bounds[2]))

		ref_chunk <- ref_gr[bounds[1]:bounds[2]]
		hits <- findOverlaps(ref_chunk, reads_gr)

		# tabulate() counts hits per query index directly, so the result
		# does not depend on the order in which the hits are reported.
		counts[bounds[1]:bounds[2]] <- tabulate(queryHits(hits), nbins = length(ref_chunk))

		# Drop the per-chunk objects before the next iteration allocates.
		rm(ref_chunk, hits)
	}
	counts
}

#plus strand
strand(input_GR_ref) <- "+"
input_GR_ref$plus_counts <- count_overlaps_by_chunk(input_GR_ref, raw_bed_GR, list_chunk_indices)

#minus strand
# The reference keeps strand "-" from here on, including in the saved object.
strand(input_GR_ref) <- "-"
input_GR_ref$minus_counts <- count_overlaps_by_chunk(input_GR_ref, raw_bed_GR, list_chunk_indices)

# Write the annotated reference (stored under the name "input_GR_ref") to the
# requested output path, then end the R session.
save(list = "input_GR_ref", file = output_obj_file)

quit()

