001 package calhoun.analysis.crf.features.tricycle13;
002
003 import java.util.List;
004
005 import calhoun.analysis.crf.AbstractFeatureManager;
006 import calhoun.analysis.crf.FeatureList;
007 import calhoun.analysis.crf.FeatureManagerEdge;
008 import calhoun.analysis.crf.ModelManager;
009 import calhoun.analysis.crf.io.InputSequence;
010 import calhoun.analysis.crf.io.TrainingSequence;
011
/** Implements basic constraints on gene calls.
013 * <ol>
014 * <li>Transition from intergenic to start must occur at ATG
015 * <li>Splice sites must be canonical GT/AG or GC/AG
016 * <li>Transition from exon to stop must be followed by a stop codon
017 * </ol>
018 */
019
// NOTE: I don't think these constraints look at frame, so I recommend using a version of this class adapted to the specific model you're considering. JPV 20060629
021
022 public class GeneConstraints extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
023
024 private static final long serialVersionUID = -3045999729941973608L;
025
026 enum Constraint {NONE, START, DONOR, ACCEPTOR, STOP, START_MINUS, DONOR_MINUS, ACCEPTOR_MINUS, STOP_MINUS, CODING, CODING_MINUS};
027 Constraint[] constraints;
028 int numStates;
029
030 public String getFeatureName(int featureIndex) {
031 return "Gene constraints";
032 }
033
034 /** This is a constraint class, so we don't return features */
035 public int getNumFeatures() {
036 return 0;
037 }
038
039 /** Set up the matrix
040 * Depends on states starting with the words 'intergenic, intron, and exon'. Also depends on the negative strand states ending in m.
041 */
042 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
043 numStates = modelInfo.getNumStates();
044 constraints = new Constraint[numStates * numStates];
045 for(int i=0; i<numStates; ++i) {
046 for(int j=0; j<numStates; ++j) {
047 String lastState = modelInfo.getStateName(i);
048 String currentState = modelInfo.getStateName(j);
049 Constraint c = Constraint.NONE;
050 if(lastState.startsWith("intergenic") && currentState.startsWith("exon")) {
051 c = currentState.endsWith("m") ? Constraint.STOP_MINUS : Constraint.START;
052 }
053 else if(lastState.startsWith("exon") && currentState.startsWith("intron")) {
054 c = currentState.endsWith("m") ? Constraint.ACCEPTOR_MINUS : Constraint.DONOR;
055 }
056 else if(lastState.startsWith("intron") && currentState.startsWith("exon")) {
057 c = currentState.endsWith("m") ? Constraint.DONOR_MINUS : Constraint.ACCEPTOR;
058 }
059 else if(lastState.startsWith("exon") && currentState.startsWith("intergenic")) {
060 c = lastState.endsWith("m") ? Constraint.START_MINUS : Constraint.STOP;
061 }
062 else if(lastState.equals("exon3") && currentState.equals("exon1")) {
063 c = Constraint.CODING;
064 }
065 else if(lastState.equals("exon1m") && currentState.equals("exon3m")) {
066 c = Constraint.CODING_MINUS;
067 }
068 constraints[i*numStates + j] = c;
069 }
070 }
071 }
072
073 public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
074 boolean valid = true;
075
076 switch(constraints[prevState*numStates + state]) {
077 case START:
078 valid = startConstraintPlus(seq, pos);
079 break;
080 case DONOR:
081 valid = donorConstraintPlus(seq, pos);
082 break;
083 case ACCEPTOR:
084 valid = acceptorConstraintPlus(seq, pos);
085 break;
086 case STOP:
087 valid = stopConstraintPlus(seq, pos);
088 break;
089 case START_MINUS:
090 valid = startConstraintMinus(seq, pos);
091 break;
092 case DONOR_MINUS:
093 valid = donorConstraintMinus(seq, pos);
094 break;
095 case ACCEPTOR_MINUS:
096 valid = acceptorConstraintMinus(seq, pos);
097 break;
098 case STOP_MINUS:
099 valid = stopConstraintMinus(seq, pos);
100 break;
101 case CODING:
102 valid = !stopConstraintPlus(seq, pos);
103 break;
104 case CODING_MINUS:
105 valid = !stopConstraintMinus(seq, pos);
106 break;
107 }
108
109 if(valid == false)
110 result.invalidate();
111 }
112
113 boolean startConstraintPlus(InputSequence<? extends Character> seq, int pos) {
114 return (seq.length() > pos + 2) && seq.getX(pos) == 'A' && seq.getX(pos+1) == 'T' && seq.getX(pos+2) == 'G';
115 }
116
117 boolean startConstraintMinus(InputSequence<? extends Character> seq, int pos) {
118 return (pos >= 3) && seq.getX(pos-3) == 'C' && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'T';
119 }
120
121 boolean donorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
122 return (seq.length() > pos + 1) && seq.getX(pos) == 'G' && (seq.getX(pos+1) == 'T' || seq.getX(pos+1) == 'C');
123 }
124
125 /** CCCGTCCCAGCCC
126 * GGGCAGGGTCGGG
127 */
128 boolean donorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
129 return (pos >= 2) && (seq.getX(pos-2) == 'A' || seq.getX(pos-2) == 'G') && seq.getX(pos-1) == 'C';
130 }
131
132 boolean acceptorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
133 return (pos > 1) && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'G';
134 }
135
136 boolean acceptorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
137 return (seq.length() > pos + 1) && seq.getX(pos) == 'C' && seq.getX(pos+1) == 'T';
138 }
139
140 boolean stopConstraintPlus(InputSequence<? extends Character> seq, int pos) {
141 if(seq.length() > pos + 2 && seq.getX(pos) == 'T') {
142 return (seq.getX(pos+1) == 'A' && (seq.getX(pos+2) == 'G' || seq.getX(pos+2) == 'A'))
143 || (seq.getX(pos+1) == 'G' && seq.getX(pos+2) == 'A');
144 }
145 return false;
146 }
147
148 boolean stopConstraintMinus(InputSequence<? extends Character> seq, int pos) {
149 if(pos >= 3 && seq.getX(pos-1) == 'A') {
150 return (seq.getX(pos-2) == 'T' && (seq.getX(pos-3) == 'C' || seq.getX(pos-3) == 'T'))
151 || (seq.getX(pos-2) == 'C' && seq.getX(pos-3) == 'T');
152 }
153 return false;
154 }
155 }