##mergeMaps.sh #!/bin/bash DIR=$1 base=$(echo $DIR | perl -ne 'm|Sample_([^/]*)|; print "$1";') echo $base file1=$(ls $DIR/*UNIQUE.map|head -1) OUT=${base}___03a___UNIQUE_FILT.map head -1 $file1 >$OUT ls $DIR/*UNIQUE.map | xargs fgrep -hv mixer | awk '$11>0{print $0}' | sort -k1,1 -k2,2n >>$OUT #ls $DIR/*UNIQUE.map | xargs fgrep -hv mixer | sort -k1,1 -k2,2n >>$OUT ##mergeMultiMaps.sh #!/bin/bash DIR=$1 base=$(echo $DIR | perl -ne 'm|Sample_([^/]*)|; print "$1";') echo $base file1=$(ls $DIR/*MULTI.map|head -1) OUT=${base}___03a___MULTI_FILT.map head -1 $file1 >$OUT ls $DIR/*MULTI.map | xargs fgrep -hv mixer | awk '$11>0{print $0}' | sort -k1,1 -k2,2n >>$OUT #ls $DIR/*MULTI.map | xargs fgrep -hv mixer | sort -k1,1 -k2,2n >>$OUT ##sam2MapCheckClip.py #!/usr/bin/env python2.6 import sys import pysam BAM_CHARD_CLIP=5 BAM_CSOFT_CLIP=4 BAM_CMATCH=0 cigarDict={BAM_CSOFT_CLIP:"S",BAM_CMATCH:"M",BAM_CHARD_CLIP:"H"} def formatCigar(cigar): cigarStr="" for (code,num) in cigar: try: cigarStr+=str(num)+cigarDict[code] except: print cigar, code, num raise return cigarStr def getGmapperOpts(cmd): F=cmd.split() parse=[(i,x.replace("-","")) for i,x in enumerate(F) if x[0]=="-" and x[1] not in "-0123456789"] ret=dict([(x,F[i+1]) for i,x in parse]) ret['h']=int(ret['h']) ret['i']=int(ret['i']) ret['m']=int(ret['m']) return ret def fmtSAMalignPos(aa): if aa.flag & 16: return [aa.aend,aa.pos+1,"-"] else: return [aa.pos+1,aa.aend,"+"] samFile=sys.argv[1] genomeFile=sys.argv[2] uniqueFP=open(samFile+"_UNIQUE.map","w") multiFP=open(samFile+"_MULTI.map","w") COLNAMES="chrom start stop strand cigar leftClip rightClip NH NM AS.orig AS.clip alignLen rID mixerID seqOrig seqAligned leftClipSeq rightClipSeq".replace(" ","\t") print >>uniqueFP, COLNAMES print >>multiFP, COLNAMES sam=pysam.Samfile(samFile) opts=getGmapperOpts(sam.header['PG'][0]['CL']) genome=pysam.Fastafile(genomeFile) for si in sam: assert isinstance(si,pysam.AlignedRead) if si.is_unmapped: continue leftClip=0 if si.cigar[0][0]!=BAM_CSOFT_CLIP else si.cigar[0][1] leftClipSeq = si.seq[:leftClip] leftClipMM = sum([x.upper()!="C" for x in leftClipSeq]) rightClip=0 if si.cigar[-1][0]!=BAM_CSOFT_CLIP else si.cigar[-1][1] rightClipSeq = si.seq[-rightClip:] rightClipMM = sum([x.upper()!="G" for x in rightClipSeq]) oldScore=si.opt("AS") newScore=opts['i']*(leftClipMM+rightClipMM)+oldScore #print >>sys.stderr, sam.references[si.rname], si.pos-1, si.aend+1, "clip=",(leftClip, rightClip), "CIGAR=",si.cigar, "Score=",(oldScore, newScore), "OPTS=",(opts['i'],opts['h']) if 1 or newScore>=opts['h']: chrom=sam.references[si.rname] pos=fmtSAMalignPos(si) out=[chrom]+pos #print >>sys.stderr, si out.append(formatCigar(si.cigar)) out.extend([leftClip,rightClip,si.opt("NH"),si.opt("NM")]) out.extend([si.opt("AS"),newScore]) out.append(si.alen) out.append(si.qname), out.append(si.qname.split(":")[-1]) out.append(si.seq) out.append(si.seq[si.qstart:si.qend]) out.append(leftClipSeq) out.append(rightClipSeq) #out.append((leftClipMM,rightClipMM)) # # Get genome flank #leftFlank=0 if si.pos-1<0 else si.pos-1 #out.append(genome.fetch(chrom,leftFlank,si.pos)) #out.append(genome.fetch(chrom,si.aend,si.aend+1)) if si.opt("NH")==1: print >>uniqueFP, "\t".join(map(str,out)) else: print >>multiFP, "\t".join(map(str,out)) uniqueFP.close() multiFP.close() ##splitMixer.py #!/usr/bin/env python2.6 import sys import Bio.SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord for seq in Bio.SeqIO.parse(sys.stdin,"fastq"): newSeq=seq[5:] newSeq.id=seq.id+":"+str(seq.seq[:5]) newSeq.name="" newSeq.description="" Bio.SeqIO.write(newSeq,sys.stdout,"fastq") ##spo11_Pipeline01.sh #!/bin/bash echo "###TS" `date` ADAPTER=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC GENOME=/ifs/data/bio/Genomes/S.cerevisiae/sacCer2/SGD/20080628/SGD_sacCer2.fasta GENOMETAG=SGD_sacCer2 #GMAPPER=/home/socci/bin/gmapper-ls --local --qv-offset 33 GMAPPER=/ifs/data/socci/Work/SeqAna/Mappers/SHRiMP/2_1_1/SHRiMP_2_1_1b/bin/gmapper-ls FASTQ=$1 FOLDER=$(dirname $FASTQ | pyp "s[-2:]|s") BASE=${FASTQ##*/} BASE=${BASE%%.*} OUTFOLDER=_._results01/$GENOMETAG/$FOLDER BASE=$OUTFOLDER/$BASE mkdir -p $OUTFOLDER BIN=pipe02.bin zcat $FASTQ | /ifs/data/socci/opt/bin/fastx_clipper -a $ADAPTER -l 15 -n -v -Q33 -i - \ | $BIN/splitMixer.py > ${BASE}___CLIPPED.fastq $GMAPPER -N 15 -U -g -1000 -q -1000 \ -m 10 -i -20 -h 100 -r 50% \ -n 1 -s 1111111111,11110111101111,1111011100100001111,1111000011001101111 \ -o 1001 -Q -E --sam-unaligned --strata \ ${BASE}___CLIPPED.fastq $GENOME >${BASE}.sam echo "###TS" `date` ##pipe02.sh #!/bin/bash BIN=pipe02.bin GENOME=/ifs/data/bio/Genomes/S.cerevisiae/sacCer2/SGD/20080628/SGD_sacCer2.fasta DATADIR=$1 DATADIR=${DATADIR%%/} TAG=$(basename $DATADIR) echo $TAG RESDIR=_._results01 ls -1 $DATADIR/S*/*_R1_*gz | xargs -n 1 bsub -pe alloc 7 -N SPO11 $BIN/spo11_Pipeline01.sh qSYNC SPO11 find $RESDIR -name "*.sam" | xargs -n 1 -I % bsub -N SAM2MAP $BIN/sam2MapCheckClip.py % $GENOME qSYNC SAM2MAP ls -d _._results01/SGD_sacCer2/$TAG/Sample_* | xargs -n 1 bsub $BIN/mergeMaps.sh ls -d _._results01/SGD_sacCer2/$TAG/Sample_* | xargs -n 1 bsub $BIN/mergeMultiMaps.sh