001    package calhoun.analysis.crf.features.tricycle13;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerNode;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
016    import calhoun.analysis.crf.io.TrainingSequence;
017    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
018    import calhoun.seq.KmerHasher;
019    import calhoun.util.Assert;
020    
021    public class GapConjunctionFeatures extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022            private static final long serialVersionUID = -7659288739348604129L;
023            private static final Log log = LogFactory.getLog(GapConjunctionFeatures.class);
024            boolean debug = log.isDebugEnabled();
025            
026            /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
027             * a gap in the multiple alignment for which either 1) The first non-gap character
028             * of the reference sequence to the right of the gap is at position i, or 2) The last
029             * non-gap character of the reference sequence to the left of the gap is at position i;
030             * and 3) the gap is a multiple of 3.  G stands for "gap"
031             * 
032             * F(i) is the same thing except for a non-multiple of three length.
033             * F stands for "frameshifter".
034             * 
035             * The we define indicator features for the following conjunctions:
036             * 1) F(i) & (coding)
037             * 2) F(i) & (intronic)
038             * 3) F(i) & (intergenic)
039             * 4) G(i) & (coding)
040             * 5) G(i) & (intronic)
041             * 6) G(i) & (intergenic)
042             */
043            
044            int startIx;  
045            ModelManager model;
046            KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
047            
048            int maxSeqLength;
049            
050            int nFeatures = 6;
051            int nStates;
052            
053            Boolean[] gapboundary, frameshifter;
054            Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
055    
056            int lastSeqLength = -1;
057            int lastpos = -1;
058            
059            
060            public GapConjunctionFeatures() {       
061            }
062    
063            public int getNumFeatures() {
064                    return nFeatures;
065            }       
066            
067            public String getFeatureName(int featureIndex) {
068                    int raw = featureIndex - startIx;
069                    Assert.a(raw<nFeatures);
070                    String ret = "GapConjunctionFeature." + raw;
071                    return ret;
072            }
073    
074            InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
075            
076            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
077                    // Try out one of the following two lines.
078                    if ( (seq != lastSeq) ) {
079                    //if( (seq.length() != lastSeqLength)  || (pos < lastpos) ) {
080                            log.debug(seq);
081                            log.debug(lastSeq);
082                            log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
083                            performPrecomputations(seq.getX(0).getMultipleAlignment());
084                            lastSeqLength = seq.length();
085                            lastpos = pos;
086                            lastSeq = seq;
087                    }
088    
089                    if (isStateCoding[state]     && frameshifter[pos])          { result.addFeature(startIx+0, 1.0); }
090                    if (isStateIntronic[state]   && frameshifter[pos])          { result.addFeature(startIx+1, 1.0); }
091                    if (isStateIntergenic[state] && frameshifter[pos])          { result.addFeature(startIx+2, 1.0); }
092                    if (isStateCoding[state]     && gapboundary[pos])           { result.addFeature(startIx+3, 1.0); }
093                    if (isStateIntronic[state]   && gapboundary[pos])           { result.addFeature(startIx+4, 1.0); }
094                    if (isStateIntergenic[state] && gapboundary[pos])           { result.addFeature(startIx+5, 1.0); }
095            }
096    
097            private void performPrecomputations(MultipleAlignmentInputSequence seq) {
098                    // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
099    
100                    // Maybe a little inefficient with memory allocation but hopefully not too much
101                    if (seq.length() > frameshifter.length) {
102                            frameshifter = new Boolean[seq.length()];
103                            gapboundary  = new Boolean[seq.length()];
104                    }
105                    
106                    for (int j=0; j<seq.length(); j++) {
107                            frameshifter[j] = false;
108                            gapboundary[j] = false;
109                    }
110                    
111                    int numSpecies = seq.numSpecies();
112                    int consensusLength = seq.getConsensusLength();
113                    
114                    for (int spec = 0; spec<numSpecies; spec++) {
115                            boolean inGap = false;
116                            int conGapStart = -1;
117                            for (int cpos = 1; cpos< consensusLength; cpos++ ) {
118                                    if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
119                                            inGap = true;
120                                            conGapStart = cpos;
121                                    }
122                                    if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
123                                            inGap = false;
124                                            int conGapEnd = cpos-1;
125                                            int gaplen = conGapEnd - conGapStart + 1;
126                                            if (gaplen <=60) {
127                                                    if ( (gaplen%3) == 0) {
128                                                            gapboundary[seq.con2refLeft(conGapStart)] = true;
129                                                            gapboundary[seq.con2refRight(conGapEnd)] = true;
130                                                    } else {
131                                                            frameshifter[seq.con2refLeft(conGapStart)] = true;
132                                                            frameshifter[seq.con2refRight(conGapEnd)] = true;                                                       
133                                                    }
134                                                    
135                                            }
136                                            
137                                    }
138                                            
139                            }
140                            
141                    }
142                    
143            }
144    
145            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
146    
147                    startIx = startingIndex;
148                    model = modelInfo;
149                    nStates = model.getNumStates();
150    
151                    maxSeqLength = 0;                       
152                    for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
153                            int len = seq.length();
154                            if (len > maxSeqLength) { maxSeqLength = len; }
155                    }
156                    frameshifter = new Boolean[maxSeqLength];
157                    gapboundary  = new Boolean[maxSeqLength];
158                    
159                    
160                    isStateCoding = new Boolean[nStates];       for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
161                    isStateCoding[model.getStateIndex("exon1")] = true;
162                    isStateCoding[model.getStateIndex("exon2")] = true;
163                    isStateCoding[model.getStateIndex("exon3")] = true;
164                    isStateCoding[model.getStateIndex("exon1m")] = true;
165                    isStateCoding[model.getStateIndex("exon2m")] = true;
166                    isStateCoding[model.getStateIndex("exon3m")] = true;            
167    
168                    isStateIntronic = new Boolean[nStates];     for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
169                    isStateIntronic[model.getStateIndex("intron1")] = true;
170                    isStateIntronic[model.getStateIndex("intron2")] = true;
171                    isStateIntronic[model.getStateIndex("intron3")] = true;
172                    isStateIntronic[model.getStateIndex("intron1m")] = true;
173                    isStateIntronic[model.getStateIndex("intron2m")] = true;
174                    isStateIntronic[model.getStateIndex("intron3m")] = true;
175    
176                    isStateIntergenic = new Boolean[nStates];   for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
177                    isStateIntergenic[model.getStateIndex("intergenic")] = true;
178                    
179            }
180            @Override
181            public CacheStrategySpec getCacheStrategy() {
182                    return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
183            }
184    }
185