Software_requirement

- BLAST (https://www.ncbi.nlm.nih.gov/books/NBK279690/)
- R (https://www.r-project.org/)
- Geneious (https://www.geneious.com/)
- MAFFT (http://mafft.cbrc.jp/alignment/software/)
- FastTree (http://www.microbesonline.org/fasttree/)
- bedtools2 (https://github.com/arq5x/bedtools2/releases/latest)
- seqtk (https://github.com/lh3/seqtk.git) 
- Sixpack (http://emboss.sourceforge.net/apps/release/6.6/emboss/apps/sixpack.html)


####################################
#                                  #
#             STEP_1               #
#                                  #
####################################
[1]

#########
##BLAST##
#########

###Make_blast_db_from_raw_reads###

cd /path_to_genome_reads/

for f in * 

do; 

makeblastdb -in ./$f -dbtype nucl;

done



###tblastn_on_TE_query_list####

for f in SRR*.fasta
do;
tblastn -db ./$f -query ./query.fasta -out ../blast/$f -outfmt  " 6 qseqid means sseqid means qstart means qend means sstart means send means evalue means bitscore length means pident means mismatch means gapopen means qlen means";
done



[2]
#################################
##Read_filtering_and_extraction##
#################################

###Open_R_session###


directory <- ("/path_to_blast_output_table/")
setwd(directory)
filestodo <- dir(path=directory, pattern="*")
filestodo



lists <- lapply(filestodo, function(x) {
  tryCatch(read.table(x, header = FALSE), error=function(e) NULL)
})


ta <- unique(lists[[1]]$V1)


fun0 <- function(x){
subset(x, x$V1==tb)
}
lis <- list()
lisi <- list()
for (i in 1:length(ta)){
tb <- as.character(ta[i])
lis[[i]] <- lapply(lists, fun0)
lisi[[i]] <- do.call(rbind, lis[[i]])
}

tc <- as.character(ta)



funu1 <- function(x){
subset (x, x[,10]>=80)
}
lis1 <- lapply(lisi, funu1)

funu2 <- function(x){
x$V5-x$V6
}
lis2 <- lapply(lis1, funu2)

funu3 <- function(x){
ifelse (x[,5]-x[,6] <=0, x[,5], x[,6])
}
nn1 <- lapply (lis1, funu3) 

funu4 <- function(x){
ifelse (x[,5]-x[,6] >=0, x[,5], x[,6])
}
nn2 <- lapply (lis1, funu4)

funu5 <- function(x){
as.character(x[,2])
} 

nn0 <- lapply (lis1, funu5)


funu6 <- function(x){
as.character(x[,1])
} 
nn3 <- lapply (lis1, funu6)

funu7 <- function(x){
paste (x ,c(1:length(x)), sep="_")
}

nn4 <- lapply(nn3, funu7)

kk <- mapply (cbind, nn0, nn1, nn2, nn4,  SIMPLIFY=F)

setwd("/path_to_bed_file_output/")

###Make_bed_files_with_exact_blast_hits_coordinates###

mapply (write.table, kk, file=tc, row.names=FALSE, col.names=FALSE, quote=FALSE, sep="	")



###Get_extanted_coordinates_to_extract_full_length_read###


directory <- ("/path_to_bed_file/")
setwd(directory)
filestodo <- dir(path=directory, pattern="*.txt")
filestodo
lists <- lapply (filestodo, read.table, header=FALSE)

directory <- ("/path_to_read_length/")
setwd(directory)
filestodo1 <- dir(path=directory, pattern="lengthXXX")
lists1 <- lapply (filestodo1, read.table, header=FALSE)
 
ko<-NULL
nj<-NULL
for (n in 1:length(lists)){

ko <- lists[[n]]$V1
nj[[n]] <- lists1[[1]][lists1[[1]]$V1 %in% ko, ]
}


j <- NULL
for (i in 1:length(nj)){

a <- nj[[i]]$V1
b <- rep(1,length(nj[[i]]$V1))
c <- nj[[i]]$V2
d <- nj[[i]]$V1

j[[i]] <- cbind(as.character(a),b,c,as.character(d))
}

###Make_bed_files_with_total_read_length###

setwd("/path_to_total_coordinates/")
for (i in 1:length(filestodo)){
write.table (file=paste(filestodo[[i]], ".bed", sep=""), j[[i]], col.names=FALSE, row.names=FALSE, quote=FALSE, sep="	") }


###exit(R)####



####extract_read_with_bedtools#####

cd /path_to_total_coordinates/

for f in *


do
/path_to_bedtools_bin/bedtools getfasta -fi /path_to_genome/ -bed $f -fo /path_to_fasta_output/$f.fa
done





[3]
####################
##de novo assembly##
####################


Import fasta files to geneious

run de novo assembly with Genious Assembler --[option] High Sensitivity / Medium

Export Consensus assemblies to a new directory


[4]
####################
##FastTree_Sorting##
####################





[[4-1]]
###make_new_directory_for_list_of_consensus_per_reference_TE####


ROOT_FOLDER="/Volumes/Baba_data/suzukii/consensus/"

cd "$ROOT_FOLDER"




for file in *; do
    dir=${file%%.*}
    mkdir -p "$dir"
    mv "$file" "$dir"
done




for f in "$ROOT_FOLDER"*; 

 
do    

[ -d $f ] && cd "$f" && 
awk 'BEGIN {n_seq=0;} /^>/ {if(n_seq%1==0){file=sprintf("myseq%d.fa",n_seq);} print >> file; n_seq++; next;} { print >> file; close(file)}' < tr4* 
 
done




for f in "$ROOT_FOLDER"*;

do    

[ -d $f ] && cd "$f" && rm ./*.fasta 

done


for f in "$ROOT_FOLDER"*;

do    

[ -d $f ] && cd "$f" && mkdir cons 

done



for f in "$ROOT_FOLDER"*;
do    

[ -d $f ] && cd "$f" && for i in *; do mv $i ./cons; done 

done


for f in "$ROOT_FOLDER"*;
do    

[ -d $f ] && cd "$f" &&  mkdir prep_ali ali tree valide

done




[[4-2]]
############Run-alignments(MAFFT)_and_TREES(FastTree)############


#######ADD_REFERENCE_TREE_FILE_(total.fasta)########





cd "$ROOT_FOLDER"

for f in "$ROOT_FOLDER"*/cons;

do    

  cp "$ROOT_FOLDER"/total.fasta $f

done



rm "$ROOT_FOLDER"total.fasta



cd "$ROOT_FOLDER"

for i in * 

do  cd  "$ROOT_FOLDER"/$i/cons 

for f in myseq*  ;  do cat $f total.fasta> ../prep_ali/$f ; done

done



cd "$ROOT_FOLDER"


for i in * 

do  cd "$ROOT_FOLDER"/$i/prep_ali



for f in * ; do mafft --auto $f > ../ali/$f ; done


done



FASTTREE="/Users/basai/Desktop/"



cd "$ROOT_FOLDER"

for i in * 

do  cd "$ROOT_FOLDER"$i/ali/


for f in myseq* ; do "$FASTTREE"FastTree -gtr -nt ./$f > ../tree/$f ; done

done




[[4-3]]
##########TREE_SORTING##########

####rename_trees_with_reference_TE_name####

cd "$ROOT_FOLDER"

for dir in *; do 
    find "$dir/tree/" -type f -print0 |  
        while IFS= read -r -d '' f; do 
            dd=$(dirname "$f")
            new="${f/tree\/}"
            new="${new//\//_}" 
            mv "$f" "$dd"/"$new"
        done
done

####cp_all_trees_in_new_directory####



cd "$ROOT_FOLDER"

mkdir totaltree

for i in * 

do   cd "$ROOT_FOLDER"$i/tree 

for f in *myseq*  ;  do mv $f "$ROOT_FOLDER"/totaltree ; done

done

cd "$ROOT_FOLDER"totaltree

mkdir valid



####OPEN_R_SESSION####





library("ape")
library("phytools")


directory <- ("/path_to_trees/")
setwd(directory)
filestodo <- dir(path=directory, pattern="*fa")

lists <- lapply (filestodo, read.newick)


fun <- function(x){
tt <- as.data.frame(x$edge)
min <- min(tt[,1])

rootedge <- subset(tt, tt[,1]==min)
}

U <- lapply(lists, fun)


funj <- function(x){
ifelse (x[[2]]- x[[1]] <= 0, x[[2]], x[[2]]-x[[1]])
}
HH <- lapply(U, funj)

fg <- function(x){
c(x[1], x[2]+1, x[3]+2)
}
HH <- lapply(HH, fg)


fun1 <- function(x){
x$tip.label
}
W <- lapply(lists, fun1)


gh <- function(x,y){
x[y]
}
JJ <- mapply(gh, W, HH)


HILL  <- as.list(as.data.frame(JJ))
funnn <- function(x){ grep("Contig", x)}
tg <- lapply(HILL, funnn)

moms <- function(x){
if (length(x)==0)
return(0)
return(1)
}

tg <- lapply(tg, moms)

ir <- do.call(rbind, tg)

tty <- cbind(ir, filestodo)
ttx <- subset(tty, tty[,1]=="0")

B <- cbind(ttx[,2])
M <- as.data.frame(filestodo)
QQ <- M$filestodo%in%B
MM <- cbind(M,QQ)
E <- as.numeric(subset(row.names(MM), MM$QQ==1))


setwd("/path_to_trees/valid/")
JLO <- for (i in c(E)){
write.table(file=as.character(filestodo[i]), write.tree(lists[[i]]), quote=FALSE, col.names=FALSE, row.names=FALSE)
}




[[4-4]]
####################################################################################
#                                                                                  #
#       Visual Inspection of remniscent Trees:                                     #
#                                                                                  #
#       -Import Trees from the valid folder into a tree viewer                     #
#                                                                                  #
#       -Double Check for Tree topology making sure that consensus are outgroups   #
#                                                                                  #
####################################################################################



[[4-5]]
############Import_consensus_sequence_from_tree_to generate_new_query_for_step2#######


cd "$ROOT_FOLDER"

for dir in *; do 
    find "$dir/cons/" -type f -print0 |  
        while IFS= read -r -d '' f; do 
            dd=$(dirname "$f")
            new="${f/cons\/}"
            new="${new//\//_}" 
            mv "$f" "$dd"/"$new"
        done
done


cd "$ROOT_FOLDER"

mkdir totalseq
mkdir sub

for i in * 

do   cd "$ROOT_FOLDER"$i/cons 

for f in *myseq*  ;  do mv $f "$ROOT_FOLDER"/totalseq ; done

done



#####Import_all_filtered_ingroup_Trees_in_a_'Trees'_folder#####

cd "$ROOT_FOLDER"/Trees

for entry in `ls $search_dir`; do     echo $entry; done >name.txt


awk '{print "$ROOT_FOLDERtotalseq/"$0}' $ROOT_FOLDER/Trees/name.txt >$ROOT_FOLDER/Trees/name1.txt




cd $ROOT_FOLDER/totalseq

for f in * 

do

grep -f $ROOT_FOLDER/Trees/name1.txt ./$f | while read f; do mv "$f" ../sub/

done


cd $ROOT_FOLDER/sub/


cat * > query.fasta

awk '/^>/{print ">consensus" ++i; next}{print}' < query.fasta > query_step2.fasta















####################################
#                                  #
#             STEP_2               #
#                                  #
####################################


Reiterate_step_1_until_step[[4-4]]_using_blastn_instead_of_tblast_in_[[1]] in a new directory:


ROOT_FOLDER="path_to_your_main_Folder/consensus_step2/"



###blastn_on_new_query_list####

for f in *.fasta
do;
blastn -db ./$f -query ./query.fasta -out ../blast/$f -outfmt  " 6 qseqid means sseqid means qstart means qend means sstart means send means evalue means bitscore length means pident means mismatch means gapopen means qlen means";
done

#############

Run up to step [[4-4]]

############



[[4-4]]
####################################################################################
#                                                                                  #
#       Visual Inspection of remniscent Trees:                                     #
#                                                                                  #
#       -Import Trees from the valid folder into a tree viewer                     #
#                                                                                  #
#       -Double Check for Tree topology making sure that consensus are outgroups   #
#                                                                                  #
#       -Cluster consensus by tree node (HeT-A/TAHRE; TART; TR1; TR2; TR3; TR4;    #
#        Jockey                                                                    #
#                                                                                  #
####################################################################################


(e.g. for folders clustered by tree node)


/cd "$ROOT_FOLDER"/Trees/
└── ORF_1
    ├── HeT-A:TAHRE
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    ├── Jockey
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    ├── TART
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    ├── TR1
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    ├── TR2
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    ├── TR3
    │   ├── consensus_1.newick
    │   ├── consensus_2.newick
    │   └── consensus_n.newick
    └── TR4
        ├── consensus_1.newick
        ├── consensus_2.newick
        └── consensus_n.newick

##########################################


cd "$ROOT_FOLDER"

for dir in *; do 
    find "$dir/cons/" -type f -print0 |  
        while IFS= read -r -d '' f; do 
            dd=$(dirname "$f")
            new="${f/cons\/}"
            new="${new//\//_}" 
            mv "$f" "$dd"/"$new"
        done
done


cd "$ROOT_FOLDER"

mkdir totalseq
mkdir sub

for i in * 

do   cd "$ROOT_FOLDER"$i/cons 

for f in *myseq*  ;  do mv $f "$ROOT_FOLDER"/totalseq ; done

done



#####Import_all_filtered_ingroup_Trees_in_a_'Trees'_folder#####

cd "$ROOT_FOLDER"/Trees/ORF_1/


cd "$ROOT_FOLDER"/Trees/ORF_1/HeT-A\:TAHRE/

for entry in `ls $search_dir`; do     echo $entry; done >name.txt

awk '{print "$ROOT_FOLDERtotalseq/"$0}' $ROOT_FOLDER/Trees/ORF_1/HeT-A\:TAHRE/name.txt >$ROOT_FOLDER/Trees/ORF_1/HeT-A\:TAHRE/name1.txt




while read file; do mv "$file" "$ROOT_FOLDER"/sub/o1heta/ ; done < /"$ROOT_FOLDER"/Trees/ORF_1/HeT-A\:TAHRE/name1.txt



cd "$ROOT_FOLDER"/sub/



for i in * 

do  cd "$ROOT_FOLDER"/sub/$i/


for f in *myseq* ; do 
transeq -sequence ./$f -outseq ./t/$f.fasta -frame 6 ; done

done





for f in "$ROOT_FOLDER"/sub/*; 

 
do    

[ -d $f ] && cd "$f" && 
awk 'BEGIN {n_seq=0;} /^>/ {if(n_seq%1==0){file=sprintf("myseq%d.fa",n_seq);} print >> file; n_seq++; next;} { print >> file; close(file)}' < *.fasta 
 
done





for f in "$ROOT_FOLDER"/sub/*;

do    

[ -d $f ] && cd "$f" && rm ./*.fasta 

done


for f in "$ROOT_FOLDER"/sub/*;

do    

[ -d $f ] && cd "$f" && mkdir cons 

done



for f in "$ROOT_FOLDER"/sub/*;
do    

[ -d $f ] && cd "$f" && for i in *; do mv $i ./cons; done 

done


for f in "$ROOT_FOLDER"/sub/*;
do    

[ -d $f ] && cd "$f" &&  mkdir prep_ali ali tree

done

#####
add fasta node to seq file
####


cd "$ROOT_FOLDER"/sub/

for i in * 

do  cd  "$ROOT_FOLDER"/sub/$i/cons 

for f in myseq*  ;  do cat $f total.fasta> ../prep_ali/$f ; done

done






cd "$ROOT_FOLDER"/sub/


for i in * 

do  cd "$ROOT_FOLDER"/sub//$i/prep_ali



for f in * ; do mafft --auto $f > ../ali/$f ; done


done




######################################
STEP2_OVER
######################################





Import translated consensus aligned to reference TEs and generate final consensus 
using the majority rule on Genieous.