


#############################################################################################
####The R function to find the reads that are not aligned back to the genome
####
####file         prefix shared by the input files: alignments are read from
####             "<file>.sorted.bam" and the reads from "<file>.fastq"
####read_length  only reads of exactly this width are kept (default 51)
####
####Files written (all in the working directory):
####  "<file>Unmapped.sorted.bam"  alignment records whose FLAG is 4 or 20
####                               (text output of samtools view, not binary BAM)
####  "fail<file>"                 first column (read ID) of those records
####  "<file>unmapped.fastq"       selected reads, written by writeXStringSet
####                               with its default settings
####Needs samtools, awk and cut on the PATH and the ShortRead package.
####Stops with an error when one of the shell commands fails.
unmapped <- function(file, read_length = 51)
{
  ####run a shell command and fail loudly instead of continuing on bad output
  run_shell <- function(cmd) {
    status <- system(cmd)
    if (status != 0) {
      stop("command failed with status ", status, ": ", cmd, call. = FALSE)
    }
    invisible(status)
  }

  bam_file <- paste0(file, ".sorted.bam")
  unmapped_file <- paste0(file, "Unmapped.sorted.bam")
  id_file <- paste0("fail", file)

  ####get ID of the unmapped reads
  ####file names are shell-quoted so spaces or metacharacters cannot break
  ####(or inject into) the command line
  ####NOTE(review): only FLAG values of exactly 4 and 20 are selected; other
  ####flag combinations that include the "unmapped" bit are not -- confirm
  run_shell(paste0(
    "samtools view ", shQuote(bam_file),
    " | awk '{if($2 ==4 || $2==20){print $0}}' > ", shQuote(unmapped_file)
  ))
  run_shell(paste0(
    "cut -f 1 ", shQuote(unmapped_file), " > ", shQuote(id_file)
  ))

  ####get the unmapped reads of full length
  ####(presumably the full-length reads are the ones without adapter -- confirm)
  library(ShortRead)
  reads_file <- paste0(file, ".fastq")
  ####NOTE(review): the file is named .fastq but is parsed as "fasta";
  ####confirm that it really holds FASTA records
  all_reads <- readDNAStringSet(reads_file, "fasta")

  ####readLines keeps every ID verbatim; read.table would cut IDs at a '#'
  ####and would stop on an empty ID file
  ids <- readLines(id_file)
  reads <- all_reads[names(all_reads) %in% ids]

  reads <- reads[width(reads) == read_length]
  out_file <- paste0(file, "unmapped.fastq")
  writeXStringSet(reads, out_file)
}
####get the unmapped reads
####NOTE(review): unmapped() appends ".sorted.bam" and ".fastq" to its
####argument, so this call looks for "dcl234_total.sorted.bam.sorted.bam" and
####"dcl234_total.sorted.bam.fastq" -- confirm whether the intended argument
####is the prefix "dcl234_total"
unmapped("dcl234_total.sorted.bam")

#############################################################################################
####use blastall to align the unmapped reads to the genome
####blastall is a shell program, so it is started through system(); written as
####a bare command line it would make this file unparsable for R
####Output: tabular (-m 8) BLAST hits in the file "dclblast"
####NOTE(review): the query file "unmapped.fastq" must exist in the working
####directory; confirm it matches the file name produced by unmapped()
blast_status <- system("blastall -p blastn -i unmapped.fastq -d ~/genome/Blast/arabidopsis_whole_genome.fasta -o dclblast -m 8 -F F")
if (blast_status != 0) {
  stop("blastall failed with status ", blast_status, call. = FALSE)
}

#############################################################################################
####The R commands to keep only the alignments whose column 3 equals 100
####(percent identity in the -m 8 tabular output) and to write the hits of each
####chromosome (column 2) into a file named after that chromosome
re <- read.table("dclblast")
re <- re[re[, 3] == 100.00, ]

for (chr_name in c("Chr1", "Chr2", "Chr3", "Chr4", "Chr5")) {
  ch1 <- re[re[, 2] == chr_name, ]
  write.table(ch1, chr_name)
}


#############################################################################################
####Helper for intron(): pair the partial alignments of one strand.
####
####hits         alignments of a single strand (same column layout as the
####             ch1 argument of intron())
####limit        exclusive upper bound for the genomic gap between the pieces
####read_length  full read length; a hit whose column 8 equals it reaches the
####             end of the read
####strand       "+" or "-"; stored in the result and selects the gap direction
####
####Returns a data frame with one row per accepted (start piece, end piece)
####pair; zero rows when nothing pairs up.
.pair_split_hits <- function(hits, limit, read_length, strand)
{
  ####pieces that contain the first base of the read / the last base of the read
  ####which() drops NA comparisons instead of producing all-NA rows
  starts <- hits[which(hits[, 7] == 1), , drop = FALSE]
  ends <- hits[which(hits[, 8] == read_length), , drop = FALSE]

  ####reads that have both kinds of piece, in order of first appearance
  keys <- as.character(unique(starts[, 1]))
  keys <- keys[keys %in% as.character(ends[, 1])]

  ####row numbers of the pieces of every such read
  start_rows <- split(seq_len(nrow(starts)),
                      factor(as.character(starts[, 1]), levels = keys))
  end_rows <- split(seq_len(nrow(ends)),
                    factor(as.character(ends[, 1]), levels = keys))

  ####all start/end combinations per read; the start piece varies slowest so
  ####the rows come out in the order of the former nested loops
  j <- as.integer(unlist(
    Map(function(s, e) rep(s, each = length(e)), start_rows, end_rows),
    use.names = FALSE
  ))
  k <- as.integer(unlist(
    Map(function(s, e) rep(e, times = length(s)), start_rows, end_rows),
    use.names = FALSE
  ))

  ####gap between the end of the first piece (column 10) and the start of the
  ####second piece (column 9); on the minus strand the coordinates decrease
  direction <- if (strand == "+") 1 else -1
  gap <- direction * (ends[k, 9] - starts[j, 10])
  keep <- which(gap > 0 & gap < limit)
  j <- j[keep]
  k <- k[keep]

  data.frame(
    read = starts[j, 1],
    chr = starts[j, 2],
    startRead1 = starts[j, 7],
    endRead1 = starts[j, 8],
    startGenome1 = starts[j, 9],
    endGenome1 = starts[j, 10],
    startRead2 = ends[k, 7],
    endRead2 = ends[k, 8],
    startGenome2 = ends[k, 9],
    endGenome2 = ends[k, 10],
    strand = rep(strand, length(j))
  )
}

####The R function to find the spliced reads
####
####ch1          table with the blastall results (-m 8 layout: column 1 read,
####             column 2 chromosome, columns 7/8 start/end on the read,
####             columns 9/10 start/end on the genome)
####limit        the distance allowed between the two pieces of a spliced read
####             (the gap must be > 0 and < limit)
####read_length  length of the reads (default 51)
####
####A read is reported when one alignment starts at read position 1, another
####alignment of the same read ends at read_length, both lie on the same
####strand, and the genomic gap between them is within the limit.
####
####Returns a data frame with the columns read, chr, startRead1, endRead1,
####startGenome1, endGenome1, startRead2, endRead2, startGenome2, endGenome2
####and strand; plus-strand pairs first, then minus-strand pairs. A strand
####without candidates contributes zero rows.
intron <- function(ch1, limit, read_length = 51)
{
  ####column 10 > column 9: aligned to the positive strand; < : negative strand
  plus_hits <- ch1[which(ch1[, 10] > ch1[, 9]), , drop = FALSE]
  minus_hits <- ch1[which(ch1[, 10] < ch1[, 9]), , drop = FALSE]

  ####combine the pairs of both strands
  rbind(
    .pair_split_hits(plus_hits, limit, read_length, "+"),
    .pair_split_hits(minus_hits, limit, read_length, "-")
  )
}

####get the spliced reads on each chromosome: read the hits of one chromosome,
####pair them with intron() using a distance limit of 1000, and write the
####result to "intron<chromosome>"
for (chr_name in c("Chr1", "Chr2", "Chr3", "Chr4", "Chr5")) {
  ch1 <- read.table(chr_name)
  in1 <- intron(ch1, 1000)
  write.table(in1, paste0("intron", chr_name))
}

