
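# step: counting k-mers in the female Illumina short reads (reads are meant to be filtered at a Phred score of 20; NOTE: the jellyfish command below passes no quality option - confirm the filtering is done beforehand)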
# input file  / format: FemaleReades.fq / fastq file, uncompressed
# output file / format: FemHouse.jelly / Jellyfish binary file.  
# comments: See Methods for choosing the k-mer size (-m option). See the Jellyfish manual for other parameters.
# The k-mer size should be large enough so that identical k-mers seldom occur by chance in the assembled genome or are generated by sequencing errors in the female reads (Kelley et al. 2010; Li et al. 2010). 
# On the other hand, run time and memory usage scale with 4^k, which is the number of elements of the bit-array.
# Carvalho and Clark used k = 15 for Drosophila, which they suggest for insect genomes. 
# We used k=15 for house fly.
# Count 15-mers (-m 15) in the female reads; the table is written to FemHouse.jelly (-o).
jellyfish count \
    -m 15 \
    -c 4 \
    -s 10G \
    -t 4 \
    -C \
    -o FemHouse.jelly \
    FemaleReades.fq

# step: production of the short‐read fasta file
# input file /  format: FemHouse.jelly / Jellyfish binary file.
# output file / format: FemHouse.fasta.gz / fasta format (text file; compressed)
# comments: see Kelley et al (2010) for choosing minimum frequency cut‐off.  
# Dump the female k-mer table as fasta text and compress it on the fly.
jellyfish dump FemHouse.jelly \
    | gzip -c \
    > FemHouse.fasta.gz

# Current versions of JellyFish do not do the kmer filtering
# Run a perl script to select kmers that are present in at least 5 copies
# Decompress to a scratch copy (kmer_extract.pl is handed a plain fasta file)
# and keep FemHouse.fasta.gz in place, so the full dump never has to be
# recompressed. Each step runs only if the previous one succeeded, so a failed
# extraction is not silently compressed and passed on.
gunzip -c FemHouse.fasta.gz > FemHouse.fasta &&
    perl kmer_extract.pl FemHouse.fasta 5 FemHouse_5.fasta &&
    gzip FemHouse_5.fasta
# Drop the scratch copy whether or not the filtering succeeded.
rm -f FemHouse.fasta

# step: production of the bit‐array representing the female k‐mers, filtered by quality and frequency
# input file /  format: FemHouse_5.fasta.gz / fasta format (text file; compressed or uncompressed)
# output file / format: FemHouse_5.trace.gz / hexadecimal representation of the bit‐array (text file; compressed)  
# comments: output file produced with the "to_Hex" function of the Bit::Vector module (http://search.cpan.org/dist/Bit‐Vector). See its manual for details.
# Build the female k-mer bit-array (mode=trace) from the frequency-filtered fasta.
perl YGS.pl \
    kmer_size=15 \
    mode=trace \
    trace=FemHouse_5.fasta.gz

# step: production of the validating bit‐array (part 1). Its use is optional. 
# input file  / format: male.fq / fastq (text file; compressed or uncompressed)
# output file / format: male.jelly / Jellyfish binary file.
# Count 15-mers (-m 15) in the male reads; the table is written to male.jelly (-o).
jellyfish count \
    -m 15 \
    -c 4 \
    -s 10G \
    -t 4 \
    -C \
    -o male.jelly \
    male.fq
# step: production of the validating bit‐array (part 2).
# input file  / format: male.jelly / Jellyfish binary file
# output file / format: male.fasta.gz / fasta format (text file; compressed)
# Dump the male k-mer table as fasta text and compress it on the fly.
jellyfish dump male.jelly \
    | gzip -c \
    > male.fasta.gz
# Current versions of JellyFish do not do the kmer filtering
# Run a perl script to select kmers that are present in at least 5 copies
# Decompress to a scratch copy (kmer_extract.pl is handed a plain fasta file)
# and keep male.fasta.gz in place, so the full dump never has to be
# recompressed. Each step runs only if the previous one succeeded, so a failed
# extraction is not silently compressed and passed on.
gunzip -c male.fasta.gz > male.fasta &&
    perl kmer_extract.pl male.fasta 5 male_5.fasta &&
    gzip male_5.fasta
# Drop the scratch copy whether or not the filtering succeeded.
rm -f male.fasta
# step: production of the validating bit‐array (part 3).
# input file  / format: male_5.fasta.gz / fasta format (text file; compressed or uncompressed)
# output file / format: male_5.trace.gz / hexadecimal representation of the bit‐array (text file; compressed)
# comments: the file male_5.trace.gz contains the validating bit‐array.
# Build the male (validating) k-mer bit-array (mode=trace) from the frequency-filtered fasta.
perl YGS.pl \
    kmer_size=15 \
    mode=trace \
    trace=male_5.fasta.gz

# step: production of the bit‐array representing the repetitive k‐mers of the genome (i.e., with copy number > 1)
# input file  / format: SOAP_scaf_1000.fas.gz / genome in fasta format (text file; compressed or uncompressed)
# output file / format: SOAP_scaf_1000.gen_rep.gz / hexadecimal representation of the bit‐array (text file; compressed)
# comments: very large scaffolds (> ~20 Mbp) are processed slowly. 
# Build the bit-array of repetitive genome k-mers (mode=contig) from the scaffold fasta.
perl YGS.pl \
    kmer_size=15 \
    mode=contig \
    contig=SOAP_scaf_1000.fas.gz

# step: final run.  
# input files (formats): 
# 	SOAP_scaf_1000.fas.gz (fasta, compressed or uncompressed), 
#	FemHouse_5.trace.gz (bit‐array, compressed text), 
#	SOAP_scaf_1000.gen_rep.gz (bit‐array, compressed text)
# 	male_trace = male_5.trace.gz (bit‐array, compressed text)
# output file /  format: SOAP_scaf_1000_FemHouse_5_male_5.final_result / text file. Each line contains the result of one scaffold, and is printed as processed.   
# Final run: score each scaffold against the female trace, the genome-repeat
# bit-array and the male validating bit-array.
# gen_rep must name the file written by the mode=contig step
# (SOAP_scaf_1000.gen_rep.gz); the previous value "a_1000.gen_rep.gz" was a
# truncated filename that no step in this pipeline produces.
perl YGS.pl kmer_size=15 mode=final_run \
    contig=SOAP_scaf_1000.fas.gz \
    trace=FemHouse_5.trace.gz \
    gen_rep=SOAP_scaf_1000.gen_rep.gz \
    male_trace=male_5.trace.gz

