001    package calhoun.analysis.crf.features.interval13;
002    
003    import java.util.ArrayList;
004    import java.util.Arrays;
005    import java.util.List;
006    
007    import org.apache.commons.logging.Log;
008    import org.apache.commons.logging.LogFactory;
009    
010    import calhoun.analysis.crf.AbstractFeatureManager;
011    import calhoun.analysis.crf.CacheStrategySpec;
012    import calhoun.analysis.crf.FeatureList;
013    import calhoun.analysis.crf.FeatureManagerNode;
014    import calhoun.analysis.crf.ModelManager;
015    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
016    import calhoun.analysis.crf.features.supporting.phylogenetic.EvolutionaryModel;
017    import calhoun.analysis.crf.features.supporting.phylogenetic.Kimura80Model;
018    import calhoun.analysis.crf.features.supporting.phylogenetic.PhylogeneticTreeFelsensteinOrder;
019    import calhoun.analysis.crf.io.InputSequence;
020    import calhoun.analysis.crf.io.TrainingSequence;
021    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
022    import calhoun.seq.KmerHasher;
023    import calhoun.util.Assert;
024    import flanagan.math.Minimisation;
025    import flanagan.math.MinimisationFunction;
026    
027    public class PhylogeneticLogprobInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
028            private static final long serialVersionUID = -7659288739348604129L;
029            private static final Log log = LogFactory.getLog(PhylogeneticLogprobInterval13.class);
030            
031            
032            int startIx;  // The index of the first feature managed by this FeatureManager
033            ModelManager model;     
034            boolean multipleFeatures = false;
035            
036            EvolutionaryModel emodelIntergenic;      // one model for a column of aligned sequence in intergenic region
037            EvolutionaryModel emodelIntronic;        // one model for intronic regions
038            ArrayList<EvolutionaryModel> emodelExonic; // a model for positions 0,1,2 = (A,T,G) of a codon n a coding exon.
039            
040            static KmerHasher hforward = new KmerHasher(KmerHasher.ACGTother,1);    // a character hasher for forward strand
041            static KmerHasher hbackward = new KmerHasher(KmerHasher.ACGTotherRC,1); // a character hasher for reverse strand
042            
043            ///////////////////////////////////////////////////////////////////////////////
044                    
045            public PhylogeneticLogprobInterval13() { }        // a constructor with no arguments
046            
047            public int getNumFeatures() {  // there is exactly one feature
048                    return multipleFeatures ? 5 : 1;
049            }       
050             
051            public String getFeatureName(int featureIndex) {
052                    if(multipleFeatures) {
053                            String[] vals = new String[] { "Intergenic", "Exon pos.", "Intron pos.", "Exon neg.", "Intron neg."};
054                            int feat = featureIndex - startIx;
055                            String table = vals[feat];
056                            return table+" phylogeny";
057                    }
058                    else {
059                            return "PhylogeneticLogProbInterval13";
060                    }
061            }
062            
063            
064            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
065    
066                    Assert.a(state < model.getNumStates());
067                    MultipleAlignmentColumn col = seq.getX(pos);
068                    
069                    double val = 0.0;
070                    int ephase;
071                    int featureOffset = Integer.MIN_VALUE;
072                    switch (state) {
073                    case 0:
074                            val = emodelIntergenic.logprob(col,true);
075                            featureOffset = 0;
076                            break;
077                    case 1:
078                    case 2:
079                    case 3:
080                            ephase = ((pos-state+1)%3+3)%3;  //((pos-(state-1))%3 +3)%3;
081                            //val = emodelExonic.get(0).logprob(col,true);
082                            val = emodelExonic.get(ephase).logprob(col,true);
083                            featureOffset = 1; // + ephase;
084                            break;
085                    case 4:
086                    case 5:
087                    case 6:
088                            val = emodelIntronic.logprob(col,true);
089                            featureOffset = 2;
090                            break;
091                    case 7:
092                    case 8:
093                    case 9:
094                            ephase = ((-pos+state+1)%3+3)%3;   // ((-pos+2+(state-7))%3 +3)%3;
095                            val = emodelExonic.get(ephase).logprobRC(col,true);
096                            featureOffset = 3; // + ephase;
097                            break;                  
098                    case 10:
099                    case 11:
100                    case 12:
101                            val = emodelIntronic.logprobRC(col,true);
102                            featureOffset = 4;
103                            break;
104                    default:
105                            Assert.a(false);
106                    }
107                    
108                    result.addFeature(startIx + (multipleFeatures ? featureOffset : 0), val);
109            }
110            
111            
112            public void train(int startingIndex, ModelManager modelInfo, final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
113                    startIx = startingIndex;
114                    model = modelInfo;
115                                    
116                    final PhylogeneticTreeFelsensteinOrder felsOrder = data.get(0).getX(0).getMultipleAlignment().getFelsensteinOrder();
117                    
118                    ArrayList<boolean[]> flagsForward  = new ArrayList<boolean[]>();
119                    ArrayList<boolean[]> flagsBackward = new ArrayList<boolean[]>();
120                    for(int seqNum=0; seqNum<data.size(); seqNum++) {
121                            TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
122                            int len = aln.length();
123                            flagsForward.add(new boolean[len]);
124                            flagsBackward.add(new boolean[len]);                                    
125                    }
126                    
127                    
128                    log.debug("Training model for intergenic regions...");
129                    for(int seqNum=0; seqNum<data.size(); seqNum++) {
130                            TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
131                            int len = aln.length();
132                            
133                            boolean[] ff = flagsForward.get(seqNum);
134                            Assert.a(ff.length == len);
135                            
136                            boolean[] fb = flagsBackward.get(seqNum);                       
137                            Assert.a(fb.length == len);
138                            
139                            for (int pos=0; pos<len; pos++) {
140                                    int y = aln.getY(pos);
141                                    if (y == 0) {
142                                            ff[pos] = true;
143                                            fb[pos] = true;
144                                    } else {
145                                            ff[pos] = false;
146                                            fb[pos] = false;
147                                    }
148                            }               
149                    }               
150                    emodelIntergenic = trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward);
151                    log.debug("Evolutionary model for intergenic regions:");
152                    emodelIntergenic.summarize();
153    
154                    
155    
156                    log.debug("Training model for intronic regions...");
157                    for(int seqNum=0; seqNum<data.size(); seqNum++) {
158                            TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
159                            int len = aln.length();
160                            
161                            boolean[] ff = flagsForward.get(seqNum);
162                            Assert.a(ff.length == len);
163                            
164                            boolean[] fb = flagsBackward.get(seqNum);                       
165                            Assert.a(fb.length == len);
166                            
167                            for (int pos=0; pos<len; pos++) {
168                                    int y = aln.getY(pos);
169                                    if ( (y == 4) || (y == 5) || (y == 6) ) {
170                                            ff[pos] = true;
171                                    } else {
172                                            ff[pos] = false;
173                                    }
174                                    if ( (y == 10) || (y == 11) || (y == 12) ) {
175                                            fb[pos] = true;
176                                    } else {
177                                            fb[pos] = false;
178                                    }
179                            }               
180                    }               
181                    emodelIntronic = trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward);
182                    log.debug("Evolutionary model for intronic regions:");
183                    emodelIntronic.summarize();
184    
185                    //        ephase = ((pos-state+1)%3+3)%3; for states 1,2,3
186                    //    ephase = ((-pos+state+1)%3+3)%3; for states 10,11,12
187                    
188                    emodelExonic = new ArrayList<EvolutionaryModel>();
189                    for (int phase =0; phase<3; phase++) {
190                            log.debug("Training model for exonic regions...");
191                            for(int seqNum=0; seqNum<data.size(); seqNum++) {
192                                    TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
193                                    int len = aln.length();
194                                    
195                                    boolean[] ff = flagsForward.get(seqNum);
196                                    Assert.a(ff.length == len);
197                                    
198                                    boolean[] fb = flagsBackward.get(seqNum);                       
199                                    Assert.a(fb.length == len);
200                                    
201                                    for (int pos=0; pos<len; pos++) {
202                                            int y = aln.getY(pos);
203                                            int pstate = ((pos-phase)%3 +3)%3 + 1;
204                                            int mstate = ((phase+pos-2)%3 + 3)%3 + 7;
205                                            if ( y == pstate ) {
206                                                    ff[pos] = true;
207                                            } else {
208                                                    ff[pos] = false;
209                                            }
210                                            if ( y==mstate ) {
211                                                    fb[pos] = true;
212                                            } else {
213                                                    fb[pos] = false;
214                                            }
215                                    }               
216                            }               
217                            emodelExonic.add(trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward));
218                            log.debug("Evolutionary model for intronic regions:");
219                            emodelExonic.get(phase).summarize();
220                    }
221                    
222                    log.debug("Just trained all evolutionary models");
223            }
224            
225            private EvolutionaryModel trainEvolutionaryModel(final PhylogeneticTreeFelsensteinOrder felsOrder,
226                            final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data,
227                            final ArrayList<boolean[]> flagsForward,
228                            final ArrayList<boolean[]> flagsBackward) {
229                    
230                    Assert.a(flagsForward.size() == data.size());
231                    Assert.a(flagsBackward.size() == data.size());
232                    
233                    // Estimate pi based on the nucleotide frequencies in the reference sequence
234                    final double[] pi = new double[]{1.0,1.0,1.0,1.0};
235                    for(int seqNum=0; seqNum<data.size(); seqNum++) {
236                            TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
237                            int len = aln.length();
238                            
239                            boolean[] ff = flagsForward.get(seqNum);
240                            Assert.a(ff.length == len);
241                            
242                            boolean[] fb = flagsBackward.get(seqNum);                       
243                            Assert.a(fb.length == len);                     
244                            
245                            for (int ix=0; ix<len; ix++) {                               
246                                    if (ff[ix]) {
247                                            int x = hforward.hash(aln.getX(ix).nucleotide(0));
248                                            if (x<4) { pi[x] += 1.0; }                                                           
249                                    }
250                                    
251                                    if (fb[ix]) {
252                                            int x = hbackward.hash(aln.getX(ix).nucleotide(0));
253                                            if (x<4) { pi[x] += 1.0; }                                                           
254                                    }                               
255                            }
256                    }
257                    double total = pi[0] + pi[1] + pi[2] + pi[3];
258                    pi[0]/=total; pi[1]/=total; pi[2]/=total; pi[3]/=total;
259    
260                    
261                    MinimisationFunction mFunc = new MinimisationFunction() {
262                            public double function(double[] d) {
263                                    double[] ed = new double[2];
264                                    ed[0] = Math.exp(d[0]);
265                                    ed[1] = Math.exp(d[1]);
266                                    
267                                    Kimura80Model R = new Kimura80Model(ed);
268                                    EvolutionaryModel M = new EvolutionaryModel(felsOrder,pi,R);
269                                    
270                                    double ret = 0;
271                                    for(int seqNum=0; seqNum<data.size(); seqNum++) {
272                                            TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
273                                            int len = aln.length();                                                 
274                                            
275                                            boolean[] ff = flagsForward.get(seqNum);
276                                            Assert.a(ff.length == len);
277                                            
278                                            boolean[] fb = flagsBackward.get(seqNum);                       
279                                            Assert.a(fb.length == len);
280                                            
281                                            for (int ix=0; ix<len; ix++) {
282                                                    if (ff[ix]) {
283                                                            ret += M.logprob(aln.getX(ix),true);                                                                    
284                                                    }
285                                                    if (fb[ix]) {
286                                                            ret += M.logprobRC(aln.getX(ix),true);
287                                                    }
288                                            }
289                                    }                               
290                                    return -ret;
291                            }
292                    };                                              
293                    
294                    // The standard mantra for minimizing the function mFunc defined above 
295                    int maxIter = 50;
296                    final int nParm   = 2;
297                    Minimisation m = new Minimisation();
298                    m.setNmax(maxIter);
299                    double[] starts = new double[nParm];
300                    Arrays.fill(starts, 0.1);
301                    double[] steps = new double[nParm];
302                    Arrays.fill(steps, 0.1);                
303                    m.nelderMead(mFunc, starts, steps);
304                    if(!m.getConvStatus()) {
305                            log.warn("WARNING - Nelder-Mead routine says convergence was not reached");
306                    }
307                    double[] results = m.getParamValues();
308                    double[] eresults = new double[]{Math.exp(results[0]),Math.exp(results[1])};
309                    
310                    return (new EvolutionaryModel(felsOrder,pi,new Kimura80Model(eresults)) );              
311            }
312    
313            @Override
314            public CacheStrategySpec getCacheStrategy() {
315                    return new CacheStrategySpec(CacheStrategy.DENSE);
316            }
317    
318            /**
319             * @return Returns the multipleFeatures.
320             */
321            public boolean isMultipleFeatures() {
322                    return multipleFeatures;
323            }
324    
325            /**
326             * @param multipleFeatures The multipleFeatures to set.
327             */
328            public void setMultipleFeatures(boolean multipleFeatures) {
329                    this.multipleFeatures = multipleFeatures;
330            }
331    }