001    package calhoun.analysis.crf.features.tricycle13;
002    
003    import java.util.ArrayList;
004    import java.util.List;
005    
006    import org.apache.commons.logging.Log;
007    import org.apache.commons.logging.LogFactory;
008    
009    import calhoun.analysis.crf.AbstractFeatureManager;
010    import calhoun.analysis.crf.CacheStrategySpec;
011    import calhoun.analysis.crf.FeatureList;
012    import calhoun.analysis.crf.FeatureManagerNode;
013    import calhoun.analysis.crf.ModelManager;
014    import calhoun.analysis.crf.BeanModel.Node;
015    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
016    import calhoun.analysis.crf.features.supporting.phylogenetic.ColumnConditionalLogProbability;
017    import calhoun.analysis.crf.io.InputSequence;
018    import calhoun.analysis.crf.io.TrainingSequence;
019    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
020    
021    public class FelsensteinFeatures extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022            private static final long serialVersionUID = -7659288739348604129L;
023            private static final Log log = LogFactory.getLog(FelsensteinFeatures.class);
024            boolean debug = log.isDebugEnabled();
025            
026            /* We implement the following features:
027               feature_intronicp(y_i-1,y_i,x,i)
028                 = log(pr(multiplealignmentcolumn | y_i = {intronicp}, reference seq nuceotide, evolutionary model))
029                     * delta(y_i=intronicp),
030               and similar features for collections of states other than intronicp = {intron1,intron2,intron3},
031               as specified by configuration properties.
032               
033               Given the evolutionary model, including a phylogenetic tree, this probability can be
034               evaluated efficiently, even with missing data, using Felsenstein's algorithm.
035               
036               The evolutionary model must be trained using maximum likelihood.
037               For this, we will use a Nelder-Mead solver that requires function evaluations but
038               does not require gradient evaluations.
039            
040               We will take the topology and the relative branch lengths of the phylogenetic tree as given
041               (perhaps in the model configuration file).  It is the overall scaling and parameters such as
042               (for an HKY model) the ratio of transitions to transversions that must be determined by maximum
043               likelihood.
044               
045               Choice looming ahead: where should the phylogenetic tree with relative branchlengths be represented?
046                 a) within the Multiple alignment, ie input from file from the data
047                 b) within the model configuration file
048                 c) a third place altogether different
049               I think maybe choice a is best...but let's be open minded until need to choose.
050            */
051    
052            int startIx;  // The index of the first feature managed by this FeatureManager
053            ModelManager model;     
054            ColumnConditionalLogProbability mo;
055            boolean tieFlag = false;
056            List<int[]> clusterIndices;
057            private int eModelNum = 0; // See ColumnConditionalLogProbability for interpetation; 0 is the default.
058            
059            public FelsensteinFeatures(List<int[]> clusters) throws ClassNotFoundException {
060                    this.clusterIndices = clusters;
061            }
062    
063            public void setClusters(List<List<Node>> clusters) {
064                    clusterIndices = new ArrayList();
065                    for(List<Node> nodeList : clusters) {
066                            int[] cluster = new int[nodeList.size()]; 
067                            clusterIndices.add(cluster);
068                            for(int i=0; i<nodeList.size(); ++i) {
069                                    cluster[i] = nodeList.get(i).getIndex();
070                            }
071                    }
072            }
073            
074            public FelsensteinFeatures() { }        
075            
076            public FelsensteinFeatures(List<int[]> clusters, List<int[]> eModelNum) {
077                    this.clusterIndices = clusters;
078                    this.eModelNum = eModelNum.get(0)[0];
079            }
080            
081            public FelsensteinFeatures(List<int[]> clusters, List<int[]> eModelNum, List<int[]> flags) {
082                    tieFlag = true;
083                    this.clusterIndices = clusters;
084                    this.eModelNum = eModelNum.get(0)[0];
085            }
086            
087            public int getNumFeatures() {
088                    if (tieFlag) {
089                            return 1;
090                    } else {
091                            return mo.numClusters();                        
092                    }
093            }       
094            
095            public String getFeatureName(int featureIndex) {
096                    return "FelsensteinFeatures";
097            }
098    
099    
100            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
101                    int cl;
102                    if (tieFlag) {
103                            cl = 0;
104                    } else {
105                            cl = mo.state2cluster(state);
106                    }
107                    result.addFeature(startIx + cl, mo.condLogProb(seq,pos,state));
108            }
109    
110            
111            public void train(int startingIndex, ModelManager modelInfo, final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
112                    startIx = startingIndex;
113                    model = modelInfo;
114            
115                    mo = new ColumnConditionalLogProbability(clusterIndices,eModelNum);
116                    
117                    mo.train(modelInfo,data);
118            }
119            @Override
120            public CacheStrategySpec getCacheStrategy() {
121                    return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
122            }
123    }