001    package calhoun.analysis.crf.features.interval13;
002    
003    import java.util.ArrayList;
004    import java.util.List;
005    
006    import org.apache.commons.logging.Log;
007    import org.apache.commons.logging.LogFactory;
008    
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerNodeExplicitLength;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.SequenceConverter;
016    import calhoun.analysis.crf.io.TrainingSequence;
017    import calhoun.analysis.crf.statistics.MixtureOfGammas;
018    import calhoun.util.Assert;
019    
020    public class StateLengthLogprobInterval13 implements FeatureManagerNodeExplicitLength<Character> {        
021            private static final long serialVersionUID = 8685543199212865835L;
022    
023            /* Returns the log probability of a state having the specified duration
024             * returned value depends only on state and length
025             * 
026             * trained using mixture of gammas model, with a few tweaks:
027             *   a) if all values identical (eg always have exactly 200 intergenic bases in training examples), then model as exponential length
028             *   b) Otherwise introduce priors at 90 and 110% of median and at 95% and 105% of median, and train mixture of gammas using EM.
029             *      This prevents convergence to a case where one of the two components is a single data point and likelihood is infinite.
030             *     
031             * Some notes on normalization: (see also mixGamma):
032             *    If you have a given state and evaluate these probabilities (before taking logs) and add up over all possible lengths,
033             *    the result should be approximately one, but not exactly.  Two reasons for discrepancy:
034             *    a) mixGamma, integrated from to to infinity, should add up to 1.0.  But summing at 1,2,3,4,... is a discrete
035             *      approximation and might not agree exactly.
036             *    b) The summation isn't from 1,2,3,4,... but from minLength[state] to maxLength[state].  However, this feature does not
037             *      have access to that min/max length information, so normalization for this reason must happen downstream.
038             */
039            private static final Log log = LogFactory.getLog(StateLengthLogprobInterval13.class);
040            
041            private int startIx;
042            ModelManager mi;
043            private String inputComponentName;
044    
045            MixtureOfGammas intergenicMixGamma;
046            MixtureOfGammas exonMixGamma;
047            MixtureOfGammas intronMixGamma;
048            
049            private boolean forceExponential = false;
050            private boolean exonExponential = false;
051            private boolean intronExponential = false;
052            private boolean multipleFeatures = false;
053            private boolean noIntergenic = false;
054    
055            public void setForceExponential(boolean forceExponential) {
056                    this.forceExponential = forceExponential;
057            }
058            
059            public void setExonExponential(boolean exonExponential) {
060                    this.exonExponential = exonExponential;
061            }
062            
063            public void setIntronExponential(boolean intronExponential) {
064                    this.intronExponential = intronExponential;
065            }       
066            
067            public void evaluateNodeLength(InputSequence<? extends Character> seq, int pos, int length, int state, FeatureList result) {
068                    Assert.a(length>0);
069                    
070                    MixtureOfGammas mg = null;
071                    int indexOffset = Integer.MIN_VALUE;
072                    switch (state) {
073                    case (0):
074                            if(noIntergenic)
075                                    return;
076                            indexOffset = 0;
077                            mg = intergenicMixGamma;
078                    break;
079                    case(1):
080                    case(2):
081                    case(3):
082                    case(7):
083                    case(8):
084                    case(9):
085                            indexOffset = 1;
086                            mg = exonMixGamma;
087                    break;
088                    case(4):
089                    case(5):
090                    case(6):
091                    case(10):
092                    case(11):
093                    case(12):
094                            indexOffset = 2;
095                            mg = intronMixGamma;
096                    break;
097                    default:
098                            Assert.a(false);
099                    }
100                    
101                    double val = mg.logEvaluate((double) length);
102                    Assert.a((val != Double.NEGATIVE_INFINITY) && (val != Double.POSITIVE_INFINITY) && (!Double.isNaN(val)));
103                    if (val>0) {
104                            log.warn("About to return a state length logprob evaluation that is greater than zero, see notes in source code.");
105                    }
106                    result.addFeature(startIx + (multipleFeatures ? indexOffset : 0),val);
107            }
108    
109            public CacheStrategySpec getCacheStrategy() {
110                    return new CacheStrategySpec(CacheStrategy.LENGTHFUNCTION);
111            }
112    
113            public String getFeatureName(int featureIndex) {
114                    if(multipleFeatures) {
115                            String[] vals = new String[] { "Intergenic", "Exon", "Intron"};
116                            String type = vals[featureIndex - startIx];
117                            return type+" lengths";
118                    }
119                    else {
120                            return "StateDurationLogProbForModelInterval13";
121                    }
122            }
123    
124            public String getInputComponent() {
125                    return inputComponentName;
126            }
127    
128            public void setInputComponent(String name) {
129                    inputComponentName = name;              
130            }
131    
132            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
133                    startIx = startingIndex;
134                    mi = modelInfo;
135                    Assert.a(mi.getNumStates()==13);
136                    
137                    ArrayList<ArrayList<Integer>> stateDurations;
138                    
139                    stateDurations = SequenceConverter.stateVector2StateLengths(data,mi.getNumStates());
140                    
141                    List<Integer> exonLengths = new ArrayList<Integer>();
142                    List<Integer> intronLengths = new ArrayList<Integer>();
143                    List<Integer> intergenicLengths = new ArrayList<Integer>();
144                    
145                    intergenicLengths.addAll(stateDurations.get(0));
146                    
147                    exonLengths.addAll(stateDurations.get(1));
148                    exonLengths.addAll(stateDurations.get(2));
149                    exonLengths.addAll(stateDurations.get(3));
150                    exonLengths.addAll(stateDurations.get(7));
151                    exonLengths.addAll(stateDurations.get(8));
152                    exonLengths.addAll(stateDurations.get(9));
153                    
154                    intronLengths.addAll(stateDurations.get(4));
155                    intronLengths.addAll(stateDurations.get(5));
156                    intronLengths.addAll(stateDurations.get(6));
157                    intronLengths.addAll(stateDurations.get(10));
158                    intronLengths.addAll(stateDurations.get(11));
159                    intronLengths.addAll(stateDurations.get(12));
160                    
161                    double[] inter = new double[intergenicLengths.size()];
162                    for (int j=0; j<intergenicLengths.size(); j++) {
163                            inter[j] = (double) intergenicLengths.get(j);
164                    }
165                    
166                    double[] exon = new double[exonLengths.size()];
167                    for (int j=0; j<exonLengths.size(); j++) {
168                            exon[j] = (double) exonLengths.get(j);
169                    }
170                    
171                    double[] intron = new double[intronLengths.size()];
172                    for (int j=0; j<intronLengths.size(); j++) {
173                            intron[j] = (double) intronLengths.get(j);
174                    }
175    
176                    if (forceExponential) {
177                            intergenicMixGamma = new MixtureOfGammas(inter,true);
178                            exonMixGamma       = new MixtureOfGammas(exon,true);
179                            intronMixGamma     = new MixtureOfGammas(intron,true);
180                    } else if (exonExponential) {
181                            intergenicMixGamma = new MixtureOfGammas(inter,true);  
182                            exonMixGamma       = new MixtureOfGammas(exon,true);
183                            intronMixGamma     = new MixtureOfGammas(intron);       
184                    } else if (intronExponential) {
185                            intergenicMixGamma = new MixtureOfGammas(inter,true);  
186                            exonMixGamma       = new MixtureOfGammas(exon);
187                            intronMixGamma     = new MixtureOfGammas(intron,true);                  
188                    } else {
189                            // by default, only intergenic regions modeled with exp length distributions
190                            intergenicMixGamma = new MixtureOfGammas(inter,true);  
191                            exonMixGamma       = new MixtureOfGammas(exon);
192                            intronMixGamma     = new MixtureOfGammas(intron);                       
193                    }
194            }
195    
196            public int getNumFeatures() {
197                    return multipleFeatures ? 3 : 1;
198            }
199    
200            /** 
201             * @return Returns the multipleFeatures.
202             */
203            public boolean isMultipleFeatures() {
204                    return multipleFeatures;
205            }
206    
207            /** set to true to indicate that intergenic, exonic, and intergenic lengths should each get a separate weight.
208             * @param multipleFeatures The multipleFeatures to set.
209             */
210            public void setMultipleFeatures(boolean multipleFeatures) {
211                    this.multipleFeatures = multipleFeatures;
212            }
213    
214            /**
215             * @return Returns the noIntergenic.
216             */
217            public boolean isNoIntergenic() {
218                    return noIntergenic;
219            }
220    
221            /**
222             * @param noIntergenic The noIntergenic to set.
223             */
224            public void setNoIntergenic(boolean noIntergenic) {
225                    this.noIntergenic = noIntergenic;
226            }
227    }