########################################################## ########################################################## ##### Python code to convert protein alignments ####### ########## into codon alignment using ################# ######## guidance alignement filtering ############# ########################################################## #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Thu Oct 3 17:58:14 2019 @author: bfouks """ ### NB: tabs (\t) == 4 spaces in python script import os, os.path from Bio import AlignIO from Bio import SeqIO from Bio.Data import CodonTable from Bio.SeqRecord import SeqRecord from Bio.Seq import Seq import collections standard_table = CodonTable.unambiguous_dna_by_id[1] stpcdn = standard_table.stop_codons def matchDNA_AA(filaln,dnaseq,aaseq,seqid,fltcol): codon_table = { 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W'} # import standard codon table postchck = False extrarem = [] errors = [] # create list to put all errors nsymbol = ['A','T','C','G','N'] # list of accepted nucleotide symbol if len([i for i in dnaseq if i not in nsymbol]) > 0: # check there is no ambiguous nucleotide code errors.append("Ambiguous nucleotide code present file: %s, seq: %s, position: %s=%s" % (filaln,seqid)) postchck == TRUE # need to be checked codons = [dnaseq[n:n+3] for n in range(0,len(dnaseq),3)] # parse cds seq by codon if stpcdn in codons: # check if stop codon in cds errors.append("Stop codon present file: %s, seq: %s, position: %s=%s\n" % (filaln,seqid)) postchck == TRUE # need to be checked if len(codons) != len(aaseq.replace('-','')): # check that length of cds match length of protein seq errors.append("Protein and cds seqs length differ: %s, seq: %s, position: %s=%s" % (filaln,seqid)) postchck == TRUE # need to be checked gdseq = "" # create empty string to append codons x = 0 # position of codons z = 1 # position of filtered aa in guidance file y = 1 # position of errors newnuc = [] # create list of all aa from alignment to check good translation newseq = [] # create a list of all codons to check good translation for nuc in aaseq: if nuc == "-": # gap in protein alignment file newseq.append("---") # gaps for codons newnuc.append("-") if z in fltcol: # check that aa pass filtering test from guidance gdseq = gdseq + "---" # if yes then append codon alignment y += 1 else: # nog gaps if "N" in codons[x]: # check if undefined nucleotidein codon if str(codons[x]).startswith(("AC","CT","CC","CG","GT","GC","GG","TC")): # check if undefined nucleotide (N) does not change aa codnew = str(codons[x]).replace("N","A") # no change of aa then a a random nucleotide to match both original aa with translated aa trslt_nuc = codon_table[codnew] # translate codon into aa if nuc != trslt_nuc: # the original aa is NOT similar to the translated one raise error and append errors.append("problem N for aln file: %s, seq: %s, position: %s=%s ==> " % (filaln,seqid,z,y) + nuc + "::" + trslt_nuc + "->" + str(codons[x])) postchck = True else: # presence of undefined nucleotide (N) lead to undefind aa trslt_nuc = "X" # translated aa is undefined else: trslt_nuc = codon_table[codons[x]] ## no Ns so translate codon into aa if z in fltcol: # check that aa pass filtering test from guidance gdseq = gdseq + codons[x] # if yes append original codon if nuc != trslt_nuc: # original aa and translated codon does NOT match raise error errors.append("problem for aln file: %s, seq: %s, position: %s=%s" % (filaln,seqid,z,y) + nuc + "::" + trslt_nuc + "->" + str(codons[x])) postchck = True ## state that sequences need a post check in order to match protein and cds seqs y += 1 newnuc.append(trslt_nuc) newseq.append(codons[x]) x += 1 z += 1 return gdseq, newnuc, newseq, postchck, errors def prot2cdn(pathguidance,pathcds,pathcdsaln): for files in os.listdir(pathguidance): remvcol = open("%s%s/MSA.PRANK.Guidance2_res_pair_col.scr.csv" % (pathguidance,files)) # iterate over guidnace filtering columns to find unreliable positions with remvcol as f: next(f) fltcol = [int(line.split(",")[0]) for line in remvcol if float(line.split(",")[-1].strip()) >= 0.93] # create list of unreliable alignment posiitons remvcol.close() cdsaln = open("%s%s.fasta" % (pathcds,files)) # open raw cds file cds_trsl = SeqIO.parse(cdsaln, "fasta") # iterate over cds file with SeqIO cds_t = AlignIO.read(cdsaln, "fasta") ## parse cds file to check number of seqs lencds = len(cds_t) # get number of seqs o = 0 # check that all seqs have been correctly transform according to protein alignment fileout = open("%s%s.fa" % (pathcdsaln,files),"w") ## write file of cds aligment for cdsseq in cds_trsl: seqid = cdsseq.id # store id of sequence to match with id of protein alignment seque = cdsseq.seq # sequence as string if "ATG" in seque[:3] or "CTG" in seque[:3] or "TTG" in seque[:3]: ## check if codon seq start with start codon needed for .translate() function newseq = SeqRecord(Seq(seque), id = cdsseq.id) protslt = newseq.translate(table="Standard",to_stop=False) ## translate codon seq into protein using standard code to check match with protein alignment else: ## if no start codon add start codon and then remove it at the end sequ = "ATG" + seque newseq = SeqRecord(Seq(sequ), id = cdsseq.id) protslt = newseq.translate(table="Standard",to_stop=False) protslt = protslt[1:] # remove added start codon gdfltaln = open("%s%s/UserMSA" % (pathguidance,files)) ## import protein alignment from guidance file aln = SeqIO.parse(gdfltaln,"fasta") ## parse it with SeqIO for record in aln: if record.id == seqid: ## match cds seq id with protein alignment seq id aaseq = str(record.seq) # seq as string mtchseqs = matchDNA_AA(files,newseq,aaseq,protslt.id,fltcol) if mtchseqs[3] == True: o += 1 # specificy that seqs has passed correctly from protein alignment to codon alignment fileout.write(">"+ str(seqid) + "\n" + str(mtchseqs[0]) + "\n") fileout.close() if o != lencds: # all codon seqs could not be aligned according to protein alignment os.system("rm %s%s.fa" % (pathcdsaln,files)) ## remove alignments prot2cdn('/user/Documents/PAML/guidance/','/user/Documents/PAML/raw_cds/','/user/Documents/PAML/aln_cds/') ########################################################## ########################################################## ##### R code to run functional category analyses ####### ########## using SUMSTAT methods ###################### ########################################################## ########################################################## #!/usr/bin/env Rscript require(topGO) require(Rgraphviz) require(permute) require(qvalue) require(ViSEAGO) ########################### #### SUMSTAT method ####### ########################### ## Category test based on score sums (home-made) Create home made SUMSTAT analysis if(!isGeneric("GOSumT")) setGeneric("GOSumT", function(object) standardGeneric("GOSumT")) setMethod("GOSumT", "classicScore", function(object) { nG <- numMembers(object) if(nG == 0) return(1) N <- sqrt(nG*var(allScore(object,FALSE))) ## compute the group sum (statistic) sObserved <- sum(membersScore(object)) zscore <- (sObserved -nG*mean(allScore(object,FALSE))) / N ## return(ifelse(zscore < 0,pnorm(zscore)*2,(1-pnorm(zscore))*2)) }) if(!isGeneric("Direct")) setGeneric("Direct", function(object) standardGeneric("Direct")) setMethod("Direct", "classicScore", function(object) { nG <- numMembers(object) if(nG == 0) return(1) N <- sqrt(nG*var(allScore(object,FALSE))) ## compute the group sum (statistic) sObserved <- sum(membersScore(object)) zscore <- (sObserved -nG*mean(allScore(object,FALSE))) / N ## return(ifelse(zscore < 0,-1,+1)) }) ### HERE define the significance threshold which is basically the LRT value for which FDR is < 0.1 siglvl <- 17.28^(1/4) # m8 = 12.55^(1/4) ############################### ##### Cellular Component ###### ############################### adpsumstc <- read.csv("aADscor_sumstat_GO_C.csv") # import GO file for cellular component from AD branch adpsumstc$count <- rep(1,length(adpsumstc[,1])) # create a column for counting how many GO category a gene has l <- tapply(adpsumstc$count,adpsumstc$GOterm,sum) # calculate the total number of gene per GO term adpsumstc$counttot <- l[adpsumstc$GOterm] # append this total number of genes per GO term in table head(adpsumstc) ## Format data for running analysis GO2geneIDc <- adpsumstc[,2] names(GO2geneIDc) = as.character(adpsumstc[,1]) geneID2GOc <- inverseList(GO2geneIDc) ## create function to define which genes are significant int.genes <- function(allScore) { return(allScore > siglvl) } ## get the LRT fourth root for a specific gene all.genesc <- tapply(adpsumstc$score,as.character(adpsumstc$GeneF),mean) ## create a subset of significant gene selgenec <- all.genesc[all.genesc > siglvl] ## import data in topGO go.objc <- new("topGOdata", ontology='CC', allGenes = all.genesc, geneSel = int.genes, annot = annFUN.gene2GO, gene2GO = geneID2GOc, nodeSize = 10) ## run SUMSTAT analysis test.state <- new("elimScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTec <- getSigGroups(go.objc,test.state) showSigOfNodes(go.objc,score(resumTec),firstSigNodes = 5, useInfo = ' all ' ) printGraph(go.objc,resumTec, firstSigNodes = 5, fn.prefix = "tGO_CC_AD_no_wz_10n", useInfo = "all", pdfSW = TRUE) scTec <- score(resumTec) ## subset of significant GOs scec <- scTec[scTec <0.05] ## run test to get the direction of significance if enriched or depleted direc.teste <- new("elimScore", testStatistic = Direct, name = "fisher",scoreOrder = "decreasing") resumDec <- getSigGroups(go.objc,direc.teste) scDec <- score(resumDec) ## run SUMSTAT analysis without the elim algorithm test.statc <- new("classicScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTcc <- getSigGroups(go.objc,test.statc) scTcc <- score(resumTcc) ## subset of significant GOs sccc <- scTcc[scTcc <0.05] ## run FDR FDRelimc <- qvalue(score(resumTec),pfdr = TRUE) pvalFDRc <- FDRelimc$lfdr tnodc <- ifelse(length(pvalFDRc[pvalFDRc < 0.2]) < 2, 3,length(pvalFDRc[pvalFDRc < 0.2])) ## Get table of significant results allResc <- GenTable(go.objc, classic = resumTcc, KS = resumTec, orderBy = "KS", ranksOf = "classic", topNodes = tnodc) allResc ## Get all results resultsc <- as.data.frame(allResc) resultsc$SumStfdr <- pvalFDRc[resultsc$GO.ID] resultsc$dir <- ifelse(scDec[resultsc$GO.ID] < 0,"-","+") resultsc ## Get mean score of GO depending on its number of genes alnodc <- length(pvalFDRc) aResc <- GenTable(go.objc, classic = resumTcc, KS = resumTec, orderBy = "KS", ranksOf = "classic", topNodes = alnodc) alresc <- as.data.frame(aResc) alresc$SumStfdr <- pvalFDRc[alresc$GO.ID] alresc$dir <- ifelse(scDec[alresc$GO.ID] < 0,"-","+") allSc <- geneScore(go.objc, use.names = TRUE) GOscorec <- list() for(i in 1:alnodc){ GOid = alresc[i,1] groupMembers <- genesInTerm(go.objc, GOid)[[1]] GOscorec[i] <- sum(allSc[names(allSc) %in% groupMembers]) } alresc$GO_score <- unlist(GOscorec) alresc$ratio_GOscore <- alresc$GO_score/alresc$Annotated ## Create graphs to check up results and significance of GOs in function of LRT values of its genes allSc <- geneScore(go.objc, use.names = TRUE) gonsigc <- ifelse(length(resultsc[resultsc$SumStfdr < 0.2,1]) < 1,2,length(resultsc[resultsc$SumStfdr < 0.2,1])) gosigc <- resultsc[c(1:gonsigc),] GOgec <- list() for(i in 1:length(gosigc[,1])){ GOid = gosigc[i,1] groupMembers <- genesInTerm(go.objc, GOid)[[1]] GOgec[i] <- list(allSc[names(allSc) %in% groupMembers]) } names(GOgec) <- gosigc[,1] pdf("Check_density_of_sig_GOs_CC_no_AD_wz_10n.pdf",width=6.5,height=6) for(i in 1:length(gosigc[,1])){ plot(density(GOgec[[i]])) } dev.off() GO_idc <- list() GenFc <- list() Scorgc <- list() for(i in 1:length(gosigc[,1])){ goid <- rep(gosigc[i,1],length(GOgec[[i]])) GO_idc <- c(GO_idc,goid) genf <- names(GOgec[[i]]) GenFc <- c(GenFc,genf) scg <- GOgec[[i]] Scorgc <- c(Scorgc,scg) } gGO_idc <- unlist(GO_idc) gGenFc <- unlist(GenFc) gScorgc <- unlist(Scorgc) chec <- cbind(gGO_idc,gGenFc,gScorgc) write.csv(chec,"SigGOs_GF&scor_AD_no_wz_10n_C.csv") ############################### ##### Molecular Function ###### ############################### adpsumstf <- read.csv("aADscor_sumstat_GO_F.csv") adpsumstf$count <- rep(1,length(adpsumstf[,1])) l <- tapply(adpsumstf$count,adpsumstf$GOterm,sum) adpsumstf$counttot <- l[adpsumstf$GOterm] GO2geneIDf <- adpsumstf[,2] names(GO2geneIDf) = as.character(adpsumstf[,1]) geneID2GOf <- inverseList(GO2geneIDf) int.genes <- function(allScore) { return(allScore > siglvl) } all.genesf <- tapply(adpsumstf$score,adpsumstf$GeneF,mean) selgenef <- all.genesf[all.genesf > siglvl] go.objf <- new("topGOdata", ontology='MF', allGenes = all.genesf, geneSel = int.genes, annot = annFUN.gene2GO, gene2GO = geneID2GOf, nodeSize = 10) test.state <- new("elimScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTef <- getSigGroups(go.objf,test.state) showSigOfNodes(go.objf,score(resumTef),firstSigNodes = 5, useInfo = ' all ' ) printGraph(go.objf,resumTef, firstSigNodes = 5, fn.prefix = "tGO_MF_AD_no_wz_10n", useInfo = "all", pdfSW = TRUE) scTef <- score(resumTef) scef <- scTef[scTef <0.05] test.statc <- new("classicScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTcf <- getSigGroups(go.objf,test.statc) scTcf <- score(resumTcf) sccf <- scTcf[scTcf <0.05] direc.teste <- new("elimScore", testStatistic = Direct, name = "fisher",scoreOrder = "decreasing") resumDef <- getSigGroups(go.objf,direc.teste) scDef <- score(resumDef) FDRelimf <- qvalue(score(resumTef),pfdr = TRUE) pvalFDRf <- FDRelimf$lfdr tnodf <- ifelse(length(pvalFDRf[pvalFDRf < 0.2]) < 2, 3,length(pvalFDRf[pvalFDRf < 0.2])) allResf <- GenTable(go.objf, classic = resumTcf, KS = resumTef, orderBy = "KS", ranksOf = "classic", topNodes = tnodf) allResf resultsf <- as.data.frame(allResf) resultsf$SumStfdr <- pvalFDRf[resultsf$GO.ID] resultsf$dir <- ifelse(scDef[resultsf$GO.ID] < 0,"-","+") resultsf alnodf <- length(pvalFDRf) aResf <- GenTable(go.objf, classic = resumTcf, KS = resumTef, orderBy = "KS", ranksOf = "classic", topNodes = alnodf) alresf <- as.data.frame(aResf) alresf$SumStfdr <- pvalFDRf[alresf$GO.ID] alresf$dir <- ifelse(scDef[alresf$GO.ID] < 0,"-","+") allSf <- geneScore(go.objf, use.names = TRUE) GOscoref <- list() for(i in 1:alnodf){ GOid = alresf[i,1] groupMembers <- genesInTerm(go.objf, GOid)[[1]] GOscoref[i] <- sum(allSf[names(allSf) %in% groupMembers]) } alresf$GO_score <- unlist(GOscoref) alresf$ratio_GOscore <- alresf$GO_score/alresf$Annotated allSf <- geneScore(go.objf, use.names = TRUE) gonsigf <- ifelse(length(resultsf[resultsf$SumStfdr < 0.2,1]) < 1,2,length(resultsf[resultsf$SumStfdr < 0.2,1])) gosigf <- resultsf[c(1:gonsigf),] GOgef <- list() for(i in 1:length(gosigf[,1])){ GOid = gosigf[i,1] groupMembers <- genesInTerm(go.objf, GOid)[[1]] GOgef[i] <- list(allSf[names(allSf) %in% groupMembers]) } names(GOgef) <- gosigf[,1] pdf("Check_density_of_sig_GOs_MF_no_AD_wz_10n.pdf",width=6.5,height=6) for(i in 1:length(gosigf[,1])){ plot(density(GOgef[[i]])) } dev.off() GO_idf <- list() GenFf <- list() Scorgf <- list() for(i in 1:length(gosigf[,1])){ goid <- rep(gosigf[i,1],length(GOgef[[i]])) GO_idf <- c(GO_idf,goid) genf <- names(GOgef[[i]]) GenFf <- c(GenFf,genf) scg <- GOgef[[i]] Scorgf <- c(Scorgf,scg) } gGO_idf <- unlist(GO_idf) gGenFf <- unlist(GenFf) gScorgf <- unlist(Scorgf) chef <- cbind(gGO_idf,gGenFf,gScorgf) write.csv(chef,"SigGOs_GF&scor_AD_no_wz_10n_F.csv") ################################# ##### Biological Processes ###### ################################# adpsumstp <- read.csv("aADscor_sumstat_GO_P.csv") adpsumstp$count <- rep(1,length(adpsumstp[,1])) l <- tapply(adpsumstp$count,adpsumstp$GOterm,sum) adpsumstp$counttot <- l[adpsumstp$GOterm] GO2geneIDp <- adpsumstp[,2] names(GO2geneIDp) = as.character(adpsumstp[,1]) geneID2GOp <- inverseList(GO2geneIDp) int.genes <- function(allScore) { return(allScore > siglvl) } all.genesp <- tapply(adpsumstp$score,adpsumstp$GeneF,mean) selgenep <- all.genesp[all.genesp > siglvl] go.objp <- new("topGOdata", ontology='BP', allGenes = all.genesp, geneSel = int.genes, annot = annFUN.gene2GO, gene2GO = geneID2GOp, nodeSize = 10) test.state <- new("elimScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTep <- getSigGroups(go.objp,test.state) showSigOfNodes(go.objp,score(resumTep),firstSigNodes = 5, useInfo = ' all ' ) printGraph(go.objp,resumTep, firstSigNodes = 5, fn.prefix = "tGO_BP_AD_no_wz_10n", useInfo = "all", pdfSW = TRUE) scTep <- score(resumTep) scep <- scTep[scTep <0.05] test.statc <- new("classicScore", testStatistic = GOSumT, name = "fisher",scoreOrder = "decreasing") resumTcp <- getSigGroups(go.objp,test.statc) scTcp <- score(resumTcp) sccp <- scTcp[scTcp <0.05] direc.teste <- new("elimScore", testStatistic = Direct, name = "fisher",scoreOrder = "decreasing") resumDep <- getSigGroups(go.objp,direc.teste) scDep <- score(resumDep) allGOres <- c(scTec,scTef,scTep) FDRelimp <- qvalue(scTep,pfdr = TRUE) pvalFDRp <- FDRelimp$lfdr tnodp <- ifelse(length(pvalFDRp[pvalFDRp < 0.2]) < 2, 3,length(pvalFDRp[pvalFDRp < 0.2])) allResp <- GenTable(go.objp, classic = resumTcp, KS = resumTep, orderBy = "KS", ranksOf = "classic", topNodes = tnodp) allResp resultsp <- as.data.frame(allResp) resultsp$SumStfdr <- pvalFDRp[resultsp$GO.ID] resultsp$dir <- ifelse(scDep[resultsp$GO.ID] < 0,"-","+") resultsp alnodp <- length(pvalFDRp) aResp <- GenTable(go.objp, classic = resumTcp, KS = resumTep, orderBy = "KS", ranksOf = "classic", topNodes = alnodp) alresp <- as.data.frame(aResp) alresp$SumStfdr <- pvalFDRp[alresp$GO.ID] alresp$dir <- ifelse(scDep[alresp$GO.ID] < 0,"-","+") allSp <- geneScore(go.objp, use.names = TRUE) GOscorep <- list() for(i in 1:alnodp){ GOid = alresp[i,1] groupMembers <- genesInTerm(go.objp, GOid)[[1]] GOscorep[i] <- sum(allSp[names(allSp) %in% groupMembers]) } alresp$GO_score <- unlist(GOscorep) alresp$ratio_GOscore <- alresp$GO_score/alresp$Annotated allSp <- geneScore(go.objp, use.names = TRUE) gonsigp <- ifelse(length(resultsp[resultsp$SumStfdr < 0.2,1]) < 1,2,length(resultsp[resultsp$SumStfdr < 0.2,1])) gonsigp gosigp <- resultsp[c(1:gonsigp),] GOgep <- list() for(i in 1:length(gosigp[,1])){ GOid = gosigp[i,1] groupMembers <- genesInTerm(go.objp, GOid)[[1]] GOgep[i] <- list(allSp[names(allSp) %in% groupMembers]) } names(GOgep) <- gosigp[,1] pdf("Check_density_of_sig_GOs_BP_no_AD_wz_10n.pdf",width=6.5,height=6) for(i in 1:length(gosigp[,1])){ plot(density(GOgep[[i]])) } dev.off() GO_idp <- list() GenFp <- list() Scorgp <- list() for(i in 1:length(gosigp[,1])){ goid <- rep(gosigp[i,1],length(GOgep[[i]])) GO_idp <- c(GO_idp,goid) genf <- names(GOgep[[i]]) GenFp <- c(GenFp,genf) scg <- GOgep[[i]] Scorgp <- c(Scorgp,scg) } gGO_idp <- unlist(GO_idp) gGenFp <- unlist(GenFp) gScorgp <- unlist(Scorgp) chep <- cbind(gGO_idp,gGenFp,gScorgp) write.csv(chep,"SigGOs_GF&scor_AD_no_wz_10n_P.csv") ## COMPILE results from GO analysis of CC, MF and BP allresults <- rbind(resultsc,resultsf,resultsp) allresults$Process <- c(rep("CC",length(resultsc[,1])),rep("MF",length(resultsf[,1])),rep("BP",length(resultsp[,1]))) write.table(allresults,"results_GO_AD_no_wz_10n.csv", sep="\t") alreslts <- rbind(alresc,alresf,alresp) alreslts$Process <- c(rep("CC",length(alresc[,1])),rep("MF",length(alresf[,1])),rep("BP",length(alresp[,1]))) write.table(alreslts,"all_results_GO_AD_no_wz_10n.csv", sep="\t")