#Note: SummarizeSeqV3 identifies conserved motifs present in the database dt provided as input. It is based, initially, on the 5-bp sequence surrounding each putative identified site (seqm2p2) and on a quantitative weight of each site, taken from the column named by the 'weightcol' parameter (in this case the median pseudouridylation level, 'medPS', is used). Iteratively, the sequences that are both most abundant and have the highest weights are identified and then co-clustered with other sequences based on the 9-bp stretch surrounding the sites. The granularity of this clustering can be controlled via the 'mincor' and 'maxPSratdif' parameters.

# GetPSSM: per-position nucleotide proportions for a set of sequences.
#   seqs - vector of sequences; coerced with as.character() and split into
#          single characters (one row per sequence, one column per position).
#          NOTE(review): the sequences are presumably all the same length,
#          since they are stacked with rbind() - confirm against callers.
# Returns a matrix with one row per nucleotide (A, C, G, T) and one column per
# position; every column holds the proportions of the four nucleotides at that
# position. Characters other than A/C/G/T are not counted.
GetPSSM <- function(seqs) {
  alphabet <- c("A", "C", "G", "T")

  # Tally of the four nucleotides within one position; anything outside the
  # alphabet becomes NA in the factor and is cut off by keeping entries 1:4.
  countbases <- function(column) {
    summary(factor(column, levels = alphabet))[1:4]
  }

  splitseqs <- strsplit(as.character(seqs), "")
  positions <- as.data.frame(do.call(rbind, splitseqs))
  tallies <- apply(positions, 2, countbases)

  # Scale every column (position) so that its four counts sum to one.
  prop.table(tallies, 2)
}


# NormColumns: rank-normalise selected columns and sort rows by a combined score.
#   data    - data.frame containing the numeric columns named in 'relcols'.
#   relcols - character vector (at least one entry) of column names to normalise.
# For every column <col> in 'relcols' a column norm_<col> is added holding the
# empirical CDF of that column evaluated at each row's own value. The column
# 'normscore' is the product of all norm_* columns of the row.
# Returns 'data' with the added columns, rows ordered by decreasing normscore.
NormColumns <- function(data, relcols) {
  stopifnot(length(relcols) > 0)

  normcols <- paste("norm", relcols, sep = "_")
  for (k in seq_along(relcols)) {
    values <- data[[relcols[k]]]
    data[[normcols[k]]] <- ecdf(values)(values)
  }

  # drop = FALSE keeps a one-column selection two-dimensional, so a single
  # entry in 'relcols' works as well (it used to fail inside apply()).
  normmat <- as.matrix(data[, normcols, drop = FALSE])
  data$normscore <- apply(normmat, 1, prod)

  data[order(data$normscore, decreasing = TRUE), , drop = FALSE]
}



# SummarizeSeqV3: cluster the short sequences (k-mers) found in column 'relseq'
# of 'dt' by the similarity of their surrounding sequence and of their mean weight.
#   dt                - data.frame with the columns named by 'relseq' and
#                       'weightcol' plus a column 'envseq' (the wider sequence
#                       context; characters 6 to 14 of it are used).
#   prefix            - accepted for compatibility; not used by this function.
#   relseq            - name of the column holding the k-mer of each site.
#   weightcol         - name of the column holding the weight of each site.
#   minnumsampperkmer - minimum number of sites a k-mer needs to be considered.
#   mincor            - k-mers join a cluster only if the correlation of their
#                       nucleotide-frequency profile with the seed exceeds this.
#   maxPSratdif       - k-mers join a cluster only if their mean weight differs
#                       from the seed's by less than this.
# Returns a list with
#   data    - the rows of 'dt' whose k-mer is not NA and contains no "N", with
#             added columns 'relseq', 'medPS' (copy of the weight column) and
#             'group' (factor: comma-joined k-mers of the cluster, or "Other").
#   summary - one row per cluster with columns allseqs, allmeanPS, allcount
#             (comma-joined per-k-mer values), meanPS, num, allT and rank.
#             All columns except 'rank' hold text, because the rows are
#             assembled through a character matrix.
# When no k-mer qualifies, a warning is issued and 'summary' has zero rows.
SummarizeSeqV3 <- function(dt, prefix = NULL, relseq = "seqm2p2", weightcol = "medPS",
                           minnumsampperkmer = 20, mincor = 0.7, maxPSratdif = 0.04) {
  absent <- setdiff(c(relseq, weightcol, "envseq"), names(dt))
  if (length(absent) > 0) {
    stop("dt is missing required column(s): ", paste(absent, collapse = ", "),
         call. = FALSE)
  }

  # Zero-row summary with the same columns as a regular result.
  emptysummary <- data.frame(allseqs = character(0), allmeanPS = character(0),
                             allcount = character(0), meanPS = character(0),
                             num = character(0), allT = character(0),
                             rank = integer(0), stringsAsFactors = FALSE)

  dt$relseq <- dt[[relseq]]
  dt$medPS <- dt[[weightcol]]

  # Keep sites whose k-mer is known and free of "N". Filtering on the values
  # themselves works for factor and character columns alike.
  kmerofrow <- as.character(dt$relseq)
  keep <- !is.na(kmerofrow) & !grepl("N", kmerofrow, fixed = TRUE)
  dt2 <- dt[keep, , drop = FALSE]
  dt2$relseq <- as.factor(as.character(dt2$relseq))
  dt2$group <- rep("Other", nrow(dt2))

  if (nrow(dt2) == 0) {
    warning("no usable sites in dt; returning an empty summary", call. = FALSE)
    dt2$group <- as.factor(dt2$group)
    return(list(data = dt2, summary = emptysummary))
  }

  # Per-k-mer mean weight and number of sites, ordered by NormColumns' score.
  men <- tapply(dt2$medPS, dt2$relseq, mean)
  siz <- tapply(dt2$medPS, dt2$relseq, length)
  tt <- data.frame(seq = names(men), meanPS = men, num = siz,
                   stringsAsFactors = FALSE)
  tt1 <- NormColumns(data = tt, relcols = c("meanPS", "num"))
  tt2 <- tt1[which(tt1$num >= minnumsampperkmer), , drop = FALSE]

  if (nrow(tt2) == 0) {
    warning("no k-mer has at least ", minnumsampperkmer,
            " sites; returning an empty summary", call. = FALSE)
    dt2$group <- as.factor(dt2$group)
    return(list(data = dt2, summary = emptysummary))
  }

  tt2$nuc <- substr(tt2$seq, 3, 3)
  kmers <- as.character(tt2$seq)

  # One column per k-mer: the flattened nucleotide-frequency matrix of
  # characters 6-14 of 'envseq' over all sites carrying that k-mer.
  sitekmer <- as.character(dt2$relseq)
  envseqs <- as.character(dt2$envseq)
  m <- t(do.call(rbind, lapply(kmers, function(kmer) {
    c(GetPSSM(substr(envseqs[sitekmer == kmer], 6, 14)))
  })))

  cr <- cor(m)
  rownames(cr) <- colnames(cr) <- kmers

  # Greedy clustering in the order of tt2 (decreasing normscore): each k-mer
  # seeds a cluster of the k-mers that are still available, correlate above
  # 'mincor' with it and have a mean weight within 'maxPSratdif' of its own.
  # Members are then zeroed out of 'cr' so that they cannot be picked again.
  clusters <- vector("list", length(kmers))
  for (i in seq_along(kmers)) {
    similar <- rownames(cr)[which(cr[, i] > mincor)]
    thresh <- tt2$meanPS[i]
    tt3 <- tt2[tt2$seq %in% similar, , drop = FALSE]
    tt4 <- tt3[which(abs(thresh - tt3$meanPS) < maxPSratdif), , drop = FALSE]

    mergedseqs <- as.character(tt4$seq)
    cr[mergedseqs, ] <- 0
    cr[, mergedseqs] <- 0
    dt2$group[dt2$relseq %in% mergedseqs] <- paste(mergedseqs, collapse = ",")

    # apply() on the data.frame is kept on purpose: it goes through
    # as.matrix(), which determines how the numbers are rendered as text, and
    # the output columns allmeanPS/allcount must keep that rendering.
    fulldesc <- apply(tt4[, c("seq", "meanPS", "num")], 2, paste, collapse = ",")
    clusters[[i]] <- c(fulldesc, mean(tt4$meanPS), sum(tt4$num),
                       all(tt4$nuc == "T"))
  }
  allclust <- as.data.frame(do.call(rbind, clusters))

  dt2$group <- as.factor(dt2$group)

  # Column 5 is the total site count; it is "0" for k-mers that had already
  # been absorbed into an earlier cluster and therefore seeded an empty one.
  cl2 <- allclust[allclust[[5]] != 0, ]
  names(cl2) <- c("allseqs", "allmeanPS", "allcount", "meanPS", "num", "allT")
  cl2$rank <- seq_len(nrow(cl2))
  list(data = dt2, summary = cl2)
}


# Load the input data; the calls below expect 'cands4' to be available
# afterwards (presumably it is defined by this file - confirm).
load("B:\\schwartz\\manuscripts\\WeizPapers\\PseudoUmammals\\Final Submission Version 4\\CarlileHelaData.RData")

# Cluster the candidate sites and keep both parts of the result.
ll <- SummarizeSeqV3(
  dt = cands4,
  prefix = NULL,
  relseq = "seqm2p2",
  minnumsampperkmer = 20,
  mincor = 0.7,
  maxPSratdif = 0.04
)
dt <- ll[["data"]]
summaryTbl <- ll[["summary"]]

#Note: restricting the input to the columns below gives the same summary table
cands5 <- cands4[, c("id", "gcoords", "seqm2p2", "medPS", "envseq")]
ll <- SummarizeSeqV3(
  dt = cands5,
  prefix = NULL,
  relseq = "seqm2p2",
  minnumsampperkmer = 20,
  mincor = 0.7,
  maxPSratdif = 0.04
)

