001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.ArrayList;
004 import java.util.Arrays;
005 import java.util.List;
006
007 import org.apache.commons.logging.Log;
008 import org.apache.commons.logging.LogFactory;
009
010 import calhoun.analysis.crf.AbstractFeatureManager;
011 import calhoun.analysis.crf.CacheStrategySpec;
012 import calhoun.analysis.crf.FeatureList;
013 import calhoun.analysis.crf.FeatureManagerNode;
014 import calhoun.analysis.crf.ModelManager;
015 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
016 import calhoun.analysis.crf.features.supporting.phylogenetic.EvolutionaryModel;
017 import calhoun.analysis.crf.features.supporting.phylogenetic.Kimura80Model;
018 import calhoun.analysis.crf.features.supporting.phylogenetic.PhylogeneticTreeFelsensteinOrder;
019 import calhoun.analysis.crf.io.InputSequence;
020 import calhoun.analysis.crf.io.TrainingSequence;
021 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
022 import calhoun.seq.KmerHasher;
023 import calhoun.util.Assert;
024 import flanagan.math.Minimisation;
025 import flanagan.math.MinimisationFunction;
026
027 public class PhylogeneticLogprobInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
028 private static final long serialVersionUID = -7659288739348604129L;
029 private static final Log log = LogFactory.getLog(PhylogeneticLogprobInterval13.class);
030
031
032 int startIx; // The index of the first feature managed by this FeatureManager
033 ModelManager model;
034 boolean multipleFeatures = false;
035
036 EvolutionaryModel emodelIntergenic; // one model for a column of aligned sequence in intergenic region
037 EvolutionaryModel emodelIntronic; // one model for intronic regions
038 ArrayList<EvolutionaryModel> emodelExonic; // a model for positions 0,1,2 = (A,T,G) of a codon n a coding exon.
039
040 static KmerHasher hforward = new KmerHasher(KmerHasher.ACGTother,1); // a character hasher for forward strand
041 static KmerHasher hbackward = new KmerHasher(KmerHasher.ACGTotherRC,1); // a character hasher for reverse strand
042
043 ///////////////////////////////////////////////////////////////////////////////
044
045 public PhylogeneticLogprobInterval13() { } // a constructor with no arguments
046
047 public int getNumFeatures() { // there is exactly one feature
048 return multipleFeatures ? 5 : 1;
049 }
050
051 public String getFeatureName(int featureIndex) {
052 if(multipleFeatures) {
053 String[] vals = new String[] { "Intergenic", "Exon pos.", "Intron pos.", "Exon neg.", "Intron neg."};
054 int feat = featureIndex - startIx;
055 String table = vals[feat];
056 return table+" phylogeny";
057 }
058 else {
059 return "PhylogeneticLogProbInterval13";
060 }
061 }
062
063
064 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
065
066 Assert.a(state < model.getNumStates());
067 MultipleAlignmentColumn col = seq.getX(pos);
068
069 double val = 0.0;
070 int ephase;
071 int featureOffset = Integer.MIN_VALUE;
072 switch (state) {
073 case 0:
074 val = emodelIntergenic.logprob(col,true);
075 featureOffset = 0;
076 break;
077 case 1:
078 case 2:
079 case 3:
080 ephase = ((pos-state+1)%3+3)%3; //((pos-(state-1))%3 +3)%3;
081 //val = emodelExonic.get(0).logprob(col,true);
082 val = emodelExonic.get(ephase).logprob(col,true);
083 featureOffset = 1; // + ephase;
084 break;
085 case 4:
086 case 5:
087 case 6:
088 val = emodelIntronic.logprob(col,true);
089 featureOffset = 2;
090 break;
091 case 7:
092 case 8:
093 case 9:
094 ephase = ((-pos+state+1)%3+3)%3; // ((-pos+2+(state-7))%3 +3)%3;
095 val = emodelExonic.get(ephase).logprobRC(col,true);
096 featureOffset = 3; // + ephase;
097 break;
098 case 10:
099 case 11:
100 case 12:
101 val = emodelIntronic.logprobRC(col,true);
102 featureOffset = 4;
103 break;
104 default:
105 Assert.a(false);
106 }
107
108 result.addFeature(startIx + (multipleFeatures ? featureOffset : 0), val);
109 }
110
111
112 public void train(int startingIndex, ModelManager modelInfo, final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
113 startIx = startingIndex;
114 model = modelInfo;
115
116 final PhylogeneticTreeFelsensteinOrder felsOrder = data.get(0).getX(0).getMultipleAlignment().getFelsensteinOrder();
117
118 ArrayList<boolean[]> flagsForward = new ArrayList<boolean[]>();
119 ArrayList<boolean[]> flagsBackward = new ArrayList<boolean[]>();
120 for(int seqNum=0; seqNum<data.size(); seqNum++) {
121 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
122 int len = aln.length();
123 flagsForward.add(new boolean[len]);
124 flagsBackward.add(new boolean[len]);
125 }
126
127
128 log.debug("Training model for intergenic regions...");
129 for(int seqNum=0; seqNum<data.size(); seqNum++) {
130 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
131 int len = aln.length();
132
133 boolean[] ff = flagsForward.get(seqNum);
134 Assert.a(ff.length == len);
135
136 boolean[] fb = flagsBackward.get(seqNum);
137 Assert.a(fb.length == len);
138
139 for (int pos=0; pos<len; pos++) {
140 int y = aln.getY(pos);
141 if (y == 0) {
142 ff[pos] = true;
143 fb[pos] = true;
144 } else {
145 ff[pos] = false;
146 fb[pos] = false;
147 }
148 }
149 }
150 emodelIntergenic = trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward);
151 log.debug("Evolutionary model for intergenic regions:");
152 emodelIntergenic.summarize();
153
154
155
156 log.debug("Training model for intronic regions...");
157 for(int seqNum=0; seqNum<data.size(); seqNum++) {
158 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
159 int len = aln.length();
160
161 boolean[] ff = flagsForward.get(seqNum);
162 Assert.a(ff.length == len);
163
164 boolean[] fb = flagsBackward.get(seqNum);
165 Assert.a(fb.length == len);
166
167 for (int pos=0; pos<len; pos++) {
168 int y = aln.getY(pos);
169 if ( (y == 4) || (y == 5) || (y == 6) ) {
170 ff[pos] = true;
171 } else {
172 ff[pos] = false;
173 }
174 if ( (y == 10) || (y == 11) || (y == 12) ) {
175 fb[pos] = true;
176 } else {
177 fb[pos] = false;
178 }
179 }
180 }
181 emodelIntronic = trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward);
182 log.debug("Evolutionary model for intronic regions:");
183 emodelIntronic.summarize();
184
185 // ephase = ((pos-state+1)%3+3)%3; for states 1,2,3
186 // ephase = ((-pos+state+1)%3+3)%3; for states 10,11,12
187
188 emodelExonic = new ArrayList<EvolutionaryModel>();
189 for (int phase =0; phase<3; phase++) {
190 log.debug("Training model for exonic regions...");
191 for(int seqNum=0; seqNum<data.size(); seqNum++) {
192 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
193 int len = aln.length();
194
195 boolean[] ff = flagsForward.get(seqNum);
196 Assert.a(ff.length == len);
197
198 boolean[] fb = flagsBackward.get(seqNum);
199 Assert.a(fb.length == len);
200
201 for (int pos=0; pos<len; pos++) {
202 int y = aln.getY(pos);
203 int pstate = ((pos-phase)%3 +3)%3 + 1;
204 int mstate = ((phase+pos-2)%3 + 3)%3 + 7;
205 if ( y == pstate ) {
206 ff[pos] = true;
207 } else {
208 ff[pos] = false;
209 }
210 if ( y==mstate ) {
211 fb[pos] = true;
212 } else {
213 fb[pos] = false;
214 }
215 }
216 }
217 emodelExonic.add(trainEvolutionaryModel(felsOrder,data,flagsForward, flagsBackward));
218 log.debug("Evolutionary model for intronic regions:");
219 emodelExonic.get(phase).summarize();
220 }
221
222 log.debug("Just trained all evolutionary models");
223 }
224
225 private EvolutionaryModel trainEvolutionaryModel(final PhylogeneticTreeFelsensteinOrder felsOrder,
226 final List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data,
227 final ArrayList<boolean[]> flagsForward,
228 final ArrayList<boolean[]> flagsBackward) {
229
230 Assert.a(flagsForward.size() == data.size());
231 Assert.a(flagsBackward.size() == data.size());
232
233 // Estimate pi based on the nucleotide frequencies in the reference sequence
234 final double[] pi = new double[]{1.0,1.0,1.0,1.0};
235 for(int seqNum=0; seqNum<data.size(); seqNum++) {
236 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
237 int len = aln.length();
238
239 boolean[] ff = flagsForward.get(seqNum);
240 Assert.a(ff.length == len);
241
242 boolean[] fb = flagsBackward.get(seqNum);
243 Assert.a(fb.length == len);
244
245 for (int ix=0; ix<len; ix++) {
246 if (ff[ix]) {
247 int x = hforward.hash(aln.getX(ix).nucleotide(0));
248 if (x<4) { pi[x] += 1.0; }
249 }
250
251 if (fb[ix]) {
252 int x = hbackward.hash(aln.getX(ix).nucleotide(0));
253 if (x<4) { pi[x] += 1.0; }
254 }
255 }
256 }
257 double total = pi[0] + pi[1] + pi[2] + pi[3];
258 pi[0]/=total; pi[1]/=total; pi[2]/=total; pi[3]/=total;
259
260
261 MinimisationFunction mFunc = new MinimisationFunction() {
262 public double function(double[] d) {
263 double[] ed = new double[2];
264 ed[0] = Math.exp(d[0]);
265 ed[1] = Math.exp(d[1]);
266
267 Kimura80Model R = new Kimura80Model(ed);
268 EvolutionaryModel M = new EvolutionaryModel(felsOrder,pi,R);
269
270 double ret = 0;
271 for(int seqNum=0; seqNum<data.size(); seqNum++) {
272 TrainingSequence<? extends MultipleAlignmentColumn> aln = data.get(seqNum);
273 int len = aln.length();
274
275 boolean[] ff = flagsForward.get(seqNum);
276 Assert.a(ff.length == len);
277
278 boolean[] fb = flagsBackward.get(seqNum);
279 Assert.a(fb.length == len);
280
281 for (int ix=0; ix<len; ix++) {
282 if (ff[ix]) {
283 ret += M.logprob(aln.getX(ix),true);
284 }
285 if (fb[ix]) {
286 ret += M.logprobRC(aln.getX(ix),true);
287 }
288 }
289 }
290 return -ret;
291 }
292 };
293
294 // The standard mantra for minimizing the function mFunc defined above
295 int maxIter = 50;
296 final int nParm = 2;
297 Minimisation m = new Minimisation();
298 m.setNmax(maxIter);
299 double[] starts = new double[nParm];
300 Arrays.fill(starts, 0.1);
301 double[] steps = new double[nParm];
302 Arrays.fill(steps, 0.1);
303 m.nelderMead(mFunc, starts, steps);
304 if(!m.getConvStatus()) {
305 log.warn("WARNING - Nelder-Mead routine says convergence was not reached");
306 }
307 double[] results = m.getParamValues();
308 double[] eresults = new double[]{Math.exp(results[0]),Math.exp(results[1])};
309
310 return (new EvolutionaryModel(felsOrder,pi,new Kimura80Model(eresults)) );
311 }
312
313 @Override
314 public CacheStrategySpec getCacheStrategy() {
315 return new CacheStrategySpec(CacheStrategy.DENSE);
316 }
317
318 /**
319 * @return Returns the multipleFeatures.
320 */
321 public boolean isMultipleFeatures() {
322 return multipleFeatures;
323 }
324
325 /**
326 * @param multipleFeatures The multipleFeatures to set.
327 */
328 public void setMultipleFeatures(boolean multipleFeatures) {
329 this.multipleFeatures = multipleFeatures;
330 }
331 }