001 package calhoun.analysis.crf.features.tricycle13;
002
003 import java.util.ArrayList;
004 import java.util.List;
005
006 import org.apache.commons.logging.Log;
007 import org.apache.commons.logging.LogFactory;
008
009 import calhoun.analysis.crf.AbstractFeatureManager;
010 import calhoun.analysis.crf.CacheStrategySpec;
011 import calhoun.analysis.crf.FeatureList;
012 import calhoun.analysis.crf.FeatureManagerNode;
013 import calhoun.analysis.crf.ModelManager;
014 import calhoun.analysis.crf.BeanModel.Node;
015 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
016 import calhoun.analysis.crf.features.supporting.phylogenetic.ColumnConditionalLogProbability;
017 import calhoun.analysis.crf.io.InputSequence;
018 import calhoun.analysis.crf.io.TrainingSequence;
019 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
020
021 public class FelsensteinFeatures extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022 private static final long serialVersionUID = -7659288739348604129L;
023 private static final Log log = LogFactory.getLog(FelsensteinFeatures.class);
024 boolean debug = log.isDebugEnabled();
025
026 /* We implement the following features:
027 feature_intronicp(y_i-1,y_i,x,i)
028 = log(pr(multiplealignmentcolumn | y_i = {intronicp}, reference seq nuceotide, evolutionary model))
029 * delta(y_i=intronicp),
030 and similar features for collections of states other than intronicp = {intron1,intron2,intron3},
031 as specified by configuration properties.
032
033 Given the evolutionary model, including a phylogenetic tree, this probability can be
034 evaluated efficiently, even with missing data, using Felsenstein's algorithm.
035
036 The evolutionary model must be trained using maximum likelihood.
037 For this, we will use a Nelder-Mead solver that requires function evaluations but
038 does not require gradient evaluations.
039
040 We will take the topology and the relative branch lengths of the phylogenetic tree as given
041 (perhaps in the model configuration file). It is the overall scaling and parameters such as
042 (for an HKY model) the ratio of transitions to transversions that must be determined by maximum
043 likelihood.
044
045 Choice looming ahead: where should the phylogenetic tree with relative branchlengths be represented?
046 a) within the Multiple alignment, ie input from file from the data
047 b) within the model configuration file
048 c) a third place altogether different
049 I think maybe choice a is best...but let's be open minded until need to choose.
050 */
051
052 int startIx; // The index of the first feature managed by this FeatureManager
053 ModelManager model;
054 ColumnConditionalLogProbability mo;
055 boolean tieFlag = false;
056 List<int[]> clusterIndices;
057 private int eModelNum = 0; // See ColumnConditionalLogProbability for interpetation; 0 is the default.
058
059 public FelsensteinFeatures(List<int[]> clusters) throws ClassNotFoundException {
060 this.clusterIndices = clusters;
061 }
062
063 public void setClusters(List<List<Node>> clusters) {
064 clusterIndices = new ArrayList();
065 for(List<Node> nodeList : clusters) {
066 int[] cluster = new int[nodeList.size()];
067 clusterIndices.add(cluster);
068 for(int i=0; i<nodeList.size(); ++i) {
069 cluster[i] = nodeList.get(i).getIndex();
070 }
071 }
072 }
073
074 public FelsensteinFeatures() { }
075
076 public FelsensteinFeatures(List<int[]> clusters, List<int[]> eModelNum) {
077 this.clusterIndices = clusters;
078 this.eModelNum = eModelNum.get(0)[0];
079 }
080
081 public FelsensteinFeatures(List<int[]> clusters, List<int[]> eModelNum, List<int[]> flags) {
082 tieFlag = true;
083 this.clusterIndices = clusters;
084 this.eModelNum = eModelNum.get(0)[0];
085 }
086
087 public int getNumFeatures() {
088 if (tieFlag) {
089 return 1;
090 } else {
091 return mo.numClusters();
092 }
093 }
094
095 public String getFeatureName(int featureIndex) {
096 return "FelsensteinFeatures";
097 }
098
099
100 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
101 int cl;
102 if (tieFlag) {
103 cl = 0;
104 } else {
105 cl = mo.state2cluster(state);
106 }
107 result.addFeature(startIx + cl, mo.condLogProb(seq,pos,state));
108 }
109
110
111 public void train(int startingIndex, ModelManager modelInfo, final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
112 startIx = startingIndex;
113 model = modelInfo;
114
115 mo = new ColumnConditionalLogProbability(clusterIndices,eModelNum);
116
117 mo.train(modelInfo,data);
118 }
119 @Override
120 public CacheStrategySpec getCacheStrategy() {
121 return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
122 }
123 }