001    package calhoun.analysis.crf.features.tricycle13;
002    
003    import java.util.List;
004    
005    import calhoun.analysis.crf.AbstractFeatureManager;
006    import calhoun.analysis.crf.FeatureList;
007    import calhoun.analysis.crf.FeatureManagerEdge;
008    import calhoun.analysis.crf.ModelManager;
009    import calhoun.analysis.crf.io.InputSequence;
010    import calhoun.analysis.crf.io.TrainingSequence;
011    
/** Implements basic constraints on gene calls.
013     * <ol>
014     * <li>Transition from intergenic to start must occur at ATG
015     * <li>Splice sites must be canonical GT/AG or GC/AG
016     * <li>Transition from exon to stop must be followed by a stop codon
017     * </ol>
018     */
019    
// NOTE: I don't think these constraints look at frame, so I recommend using a version of this class adapted to the specific model you're considering.  JPV 20060629
021    
022    public class GeneConstraints extends AbstractFeatureManager<Character> implements FeatureManagerEdge<Character> {
023    
024            private static final long serialVersionUID = -3045999729941973608L;
025    
026            enum Constraint {NONE, START, DONOR, ACCEPTOR, STOP, START_MINUS, DONOR_MINUS, ACCEPTOR_MINUS, STOP_MINUS, CODING, CODING_MINUS};
027            Constraint[] constraints;
028            int numStates;
029            
030            public String getFeatureName(int featureIndex) {
031                    return "Gene constraints";
032            }
033    
034            /** This is a constraint class, so we don't return features */
035            public int getNumFeatures() {
036                    return 0;
037            }
038    
039            /** Set up the matrix
040             * Depends on states starting with the words 'intergenic, intron, and exon'.  Also depends on the negative strand states ending in m.
041             */
042            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends Character>> data) {
043                    numStates = modelInfo.getNumStates();
044                    constraints = new Constraint[numStates * numStates];
045                    for(int i=0; i<numStates; ++i) {
046                            for(int j=0; j<numStates; ++j) {
047                                    String lastState = modelInfo.getStateName(i);
048                                    String currentState = modelInfo.getStateName(j);
049                                    Constraint c = Constraint.NONE;
050                                    if(lastState.startsWith("intergenic") && currentState.startsWith("exon")) {
051                                            c = currentState.endsWith("m") ? Constraint.STOP_MINUS : Constraint.START;
052                                    }
053                                    else if(lastState.startsWith("exon") && currentState.startsWith("intron")) {
054                                            c = currentState.endsWith("m") ? Constraint.ACCEPTOR_MINUS : Constraint.DONOR;
055                                    }
056                                    else if(lastState.startsWith("intron") && currentState.startsWith("exon")) {
057                                            c = currentState.endsWith("m") ? Constraint.DONOR_MINUS : Constraint.ACCEPTOR;
058                                    }
059                                    else if(lastState.startsWith("exon") && currentState.startsWith("intergenic")) {
060                                            c = lastState.endsWith("m") ? Constraint.START_MINUS : Constraint.STOP;
061                                    }
062                                    else if(lastState.equals("exon3") && currentState.equals("exon1")) {
063                                            c = Constraint.CODING;
064                                    }
065                                    else if(lastState.equals("exon1m") && currentState.equals("exon3m")) {
066                                            c = Constraint.CODING_MINUS;
067                                    }
068                                    constraints[i*numStates + j] = c;
069                            }
070                    }
071            }
072            
073            public void evaluateEdge(InputSequence<? extends Character> seq, int pos, int prevState, int state, FeatureList result) {
074                    boolean valid = true;
075    
076                    switch(constraints[prevState*numStates + state]) {
077                            case START:
078                                    valid = startConstraintPlus(seq, pos);
079                                    break;
080                            case DONOR:
081                                    valid = donorConstraintPlus(seq, pos);
082                                    break;
083                            case ACCEPTOR:
084                                    valid = acceptorConstraintPlus(seq, pos);
085                                    break;
086                            case STOP:
087                                    valid = stopConstraintPlus(seq, pos);
088                                    break;
089                            case START_MINUS:
090                                    valid = startConstraintMinus(seq, pos);
091                                    break;
092                            case DONOR_MINUS:
093                                    valid = donorConstraintMinus(seq, pos);
094                                    break;
095                            case ACCEPTOR_MINUS:
096                                    valid = acceptorConstraintMinus(seq, pos);
097                                    break;
098                            case STOP_MINUS:
099                                    valid = stopConstraintMinus(seq, pos);
100                                    break;
101                            case CODING:
102                                    valid = !stopConstraintPlus(seq, pos);
103                                    break;
104                            case CODING_MINUS:
105                                    valid = !stopConstraintMinus(seq, pos);
106                                    break;
107                    }
108                    
109                    if(valid == false)
110                            result.invalidate();
111            }
112            
113            boolean startConstraintPlus(InputSequence<? extends Character> seq, int pos) {
114                    return (seq.length() > pos + 2) && seq.getX(pos) == 'A' && seq.getX(pos+1) == 'T' && seq.getX(pos+2) == 'G';
115            }
116    
117            boolean startConstraintMinus(InputSequence<? extends Character> seq, int pos) {
118                    return (pos >= 3) && seq.getX(pos-3) == 'C' && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'T';
119            }
120    
121            boolean donorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
122                    return (seq.length() > pos + 1) && seq.getX(pos) == 'G' && (seq.getX(pos+1) == 'T' || seq.getX(pos+1) == 'C');
123            }
124    
125            /**  CCCGTCCCAGCCC 
126             *   GGGCAGGGTCGGG
127             */
128            boolean donorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
129                    return (pos >= 2) && (seq.getX(pos-2) == 'A' || seq.getX(pos-2) == 'G') && seq.getX(pos-1) == 'C';
130            }
131    
132            boolean acceptorConstraintPlus(InputSequence<? extends Character> seq, int pos) {
133                    return (pos > 1) && seq.getX(pos-2) == 'A' && seq.getX(pos-1) == 'G';
134            }
135    
136            boolean acceptorConstraintMinus(InputSequence<? extends Character> seq, int pos) {
137                    return (seq.length() > pos + 1) && seq.getX(pos) == 'C' && seq.getX(pos+1) == 'T';
138            }
139    
140            boolean stopConstraintPlus(InputSequence<? extends Character> seq, int pos) {
141                    if(seq.length() > pos + 2 && seq.getX(pos) == 'T') {
142                            return (seq.getX(pos+1) == 'A' && (seq.getX(pos+2) == 'G' || seq.getX(pos+2) == 'A'))
143                                            || (seq.getX(pos+1) == 'G' && seq.getX(pos+2) == 'A');
144                    }
145                    return false;
146            }
147    
148            boolean stopConstraintMinus(InputSequence<? extends Character> seq, int pos) {
149                    if(pos >= 3 && seq.getX(pos-1) == 'A') {
150                            return (seq.getX(pos-2) == 'T' && (seq.getX(pos-3) == 'C' || seq.getX(pos-3) == 'T'))
151                                            || (seq.getX(pos-2) == 'C' && seq.getX(pos-3) == 'T');
152                    }
153                    return false;
154            }
155    }