001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerEdge;
012 import calhoun.analysis.crf.FeatureManagerNode;
013 import calhoun.analysis.crf.ModelManager;
014 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
015 import calhoun.analysis.crf.io.InputSequence;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.util.Assert;
018
019 /** Implements basic constraints on gene calls.
020 * 1) Intergenic - start must occur at ATG
021 * 2) Splice sites must be canonical GT/AG or GC/AG
022 * 3) Exon-stop must be followed by a start codon
023 */
024 public class GeneConstraintsInterval13 extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character>, FeatureManagerNode<Character> {
025 @SuppressWarnings("unused")
026 private static final Log log = LogFactory.getLog(GeneConstraintsInterval13.class);
027 private static final long serialVersionUID = 3041359216265032511L;
028
029 public String getFeatureName(int featureIndex) {
030 return "Gene constraints for the model Interval13";
031 }
032
033 /** This is a constraint class, so we don't return features */
034 public int getNumFeatures() {
035 return 0;
036 }
037
038 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
039 Interval13Tools.verify(modelInfo);
040 }
041
042 public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
043 boolean valid = true;
044
045 int eind,iind;
046
047 switch(Interval13Tools.edgeConstraints[prevState*Interval13Tools.numStates + state]) {
048 case NONE:
049 break;
050 case NEVER:
051 Assert.a(false);
052 break;
053 case PSTART:
054 eind = Interval13Tools.check012(state-1);
055 if ((pos-eind)%3 != 0) { valid = false; break; }
056 valid = startConstraintPlus(seq, pos);
057 break;
058 case PDON:
059 iind = Interval13Tools.check012(state-4);
060 eind = Interval13Tools.check012(prevState-1);
061 if ((pos-eind+iind)%3 != 0) { valid = false; break; }
062 valid = donorConstraintPlus(seq, pos);
063 break;
064 case PACC:
065 iind = Interval13Tools.check012(prevState-4);
066 eind = Interval13Tools.check012(state-1);
067 if ((pos-eind+iind)%3 != 0) { valid = false; break; }
068 valid = acceptorConstraintPlus(seq, pos);
069 break;
070 case PSTOP:
071 eind = Interval13Tools.check012(prevState-1);
072 if ((pos-eind)%3 != 0) { valid = false; break; }
073 valid = stopEdgeConstraintPlus(seq, pos);
074 break;
075 case MSTART:
076 eind = Interval13Tools.check012(prevState-7);
077 if ((pos-eind)%3 != 0) { valid = false; break; }
078 valid = startConstraintMinus(seq, pos);
079 break;
080 case MDON:
081 iind = Interval13Tools.check012(prevState-10);
082 eind = Interval13Tools.check012(state-7);
083 if ((pos-eind-iind)%3 != 0) { valid = false; break; }
084 valid = donorConstraintMinus(seq, pos);
085 break;
086 case MACC:
087 iind = Interval13Tools.check012(state-10);
088 eind = Interval13Tools.check012(prevState-7);
089 if ((pos-eind-iind)%3 != 0) { valid = false; break; }
090 valid = acceptorConstraintMinus(seq, pos);
091 break;
092 case MSTOP:
093 eind = Interval13Tools.check012(state-7);
094 if ((pos-eind)%3 != 0) { valid = false; break; }
095 valid = stopEdgeConstraintMinus(seq, pos);
096 break;
097 case PCODE: // redundant with node invalidation below
098 eind = Interval13Tools.check012(state-1);
099 if ( (pos-eind)%3 == 2) {
100 valid = !stopNodeConstraintPlus(seq, pos);
101 }
102 break;
103 case MCODE: // redundant iwth node evaluation below
104 eind = Interval13Tools.check012(state-7);
105 if ( (pos-eind)%3==0) {
106 valid = !stopNodeConstraintMinus(seq, pos);
107 }
108 break;
109 default:
110 Assert.a(false);
111 }
112
113 // This debugging code is pretty clutch
114 // if(!valid) {
115 // String str = "";
116 // for (int i = -8; i < 9; i++) {
117 // str += seq.getX(pos+i);
118 // }
119 // System.out.println(" v ");
120 // System.out.println(str);
121 // System.out.println(seq.toString());
122 // }
123
124 if(valid == false)
125 result.invalidate();
126 }
127
128 public void evaluateNode(InputSequence<? extends Character> seq, int pos, int state, FeatureList result) {
129 boolean valid = true;
130
131 int eind;
132
133 switch(Interval13Tools.nodeConstraints[state]) {
134 case NONE:
135 break;
136 case NEVER:
137 Assert.a(false);
138 break;
139 case PCODE:
140 eind = Interval13Tools.check012(state-1);
141 if ( (pos-eind)%3 == 2) {
142 valid = !stopNodeConstraintPlus(seq, pos);
143 }
144 break;
145 case MCODE:
146 eind = Interval13Tools.check012(state-7);
147 if ( (pos-eind)%3==0) {
148 valid = !stopNodeConstraintMinus(seq, pos);
149 }
150 break;
151 default:
152 Assert.a(false);
153 }
154 if(valid == false)
155 result.invalidate();
156 }
157
158
159 private boolean startConstraintPlus(InputSequence<? extends Character> seq, int pos) {
160 return (seq.length() > pos + 2) && seq.getX(pos) == 'A' && seq.getX(pos+1) == 'T' && seq.getX(pos+2) == 'G';
161 }
162
163 private boolean startConstraintMinus(InputSequence<? extends Character> seq, int pos) {
164 return (pos >= 3) && seq.getX(pos-3) == 'C' && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'T';
165 }
166
167 private boolean donorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
168 return (seq.length() > pos + 1) && seq.getX(pos) == 'G' && (seq.getX(pos+1) == 'T' || seq.getX(pos+1) == 'C');
169 }
170
171 private boolean donorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
172 boolean ret = (pos >= 2) && (seq.getX(pos-2) == 'A' || seq.getX(pos-2) == 'G') && seq.getX(pos-1) == 'C';
173 //if(!ret) log.warn("Seq wrong at MDON");
174 return ret;
175 }
176
177 private boolean acceptorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
178 return (pos > 1) && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'G';
179 }
180
181 private boolean acceptorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
182 boolean ret = (seq.length() > pos + 1) && seq.getX(pos) == 'C' && seq.getX(pos+1) == 'T';
183 //if(!ret) log.warn("Seq wrong at MACC - expected CT but was "+seq.getX(pos)+seq.getX(pos+1));
184 return ret;
185 }
186
187 //////////////////////////////////
188
189
190 private boolean stopEdgeConstraintPlus(InputSequence<? extends Character> seq, int pos) {
191 if(pos < (seq.length()-2) && seq.getX(pos) == 'T') {
192 return (seq.getX(pos+1) == 'A' && (seq.getX(pos+2) == 'G' || seq.getX(pos+2) == 'A'))
193 || (seq.getX(pos+1) == 'G' && seq.getX(pos+2) == 'A');
194 }
195 return false;
196 }
197
198 private boolean stopEdgeConstraintMinus(InputSequence<? extends Character> seq, int pos) {
199 if(pos>=3 && seq.getX(pos-1) == 'A') {
200 boolean ret = (seq.getX(pos-2) == 'T' && (seq.getX(pos-3) == 'C' || seq.getX(pos-3) == 'T'))
201 || (seq.getX(pos-2) == 'C' && seq.getX(pos-3) == 'T');
202 //if(!ret) log.warn("Seq wrong at MSTOP edge entry");
203 return ret;
204 }
205 //log.warn("Seq wrong at MSTOP edge exit");
206 return false;
207 }
208
209 /////////////////////////////////////////
210
211 private boolean stopNodeConstraintPlus(InputSequence<? extends Character> seq, int pos) {
212 if(pos >= 2 && seq.getX(pos-2) == 'T') {
213 return (seq.getX(pos-1) == 'A' && (seq.getX(pos) == 'G' || seq.getX(pos) == 'A'))
214 || (seq.getX(pos-1) == 'G' && seq.getX(pos) == 'A');
215 }
216 return false;
217 }
218
219 private boolean stopNodeConstraintMinus(InputSequence<? extends Character> seq, int pos) {
220 if(pos<(seq.length()-2) && seq.getX(pos+2) == 'A') {
221 boolean ret = (seq.getX(pos+1) == 'T' && (seq.getX(pos) == 'C' || seq.getX(pos) == 'T'))
222 || (seq.getX(pos+1) == 'C' && seq.getX(pos) == 'T');
223 return ret;
224 }
225 return false;
226 }
227
228 @Override
229 public CacheStrategySpec getCacheStrategy() {
230 return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
231 }
232
233 }