001    package calhoun.analysis.crf.features.interval29;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerNode;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.features.interval13.GapFeaturesInterval13;
015    import calhoun.analysis.crf.io.InputSequence;
016    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
017    import calhoun.analysis.crf.io.TrainingSequence;
018    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
019    import calhoun.seq.KmerHasher;
020    import calhoun.util.Assert;
021    
022    public class GapFeaturesInterval29 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn>{
023            private static final long serialVersionUID = 8484623829139711313L;
024            private static final Log log = LogFactory.getLog(GapFeaturesInterval29.class);
025            boolean debug = log.isDebugEnabled();
026            String[] featureNames = new String[] { "Frameshift coding", "Frameshift intron", "Frameshift exon", "Mod 3 gap coding", "Mod 3 gap intron", "Mod 3 gap exon"};
027            
028            /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
029             * a gap in the multiple alignment for which either 1) The first non-gap character
030             * of the reference sequence to the right of the gap is at position i, or 2) The last
031             * non-gap character of the reference sequence to the left of the gap is at position i;
032             * and 3) the gap is a multiple of 3.  G stands for "gap"
033             * 
034             * F(i) is the same thing except for a non-multiple of three length.
035             * F stands for "frameshifter".
036             * 
037             * The we define indicator features for the following conjunctions:
038             * 1) F(i) & (coding)
039             * 2) F(i) & (intronic)
040             * 3) F(i) & (intergenic)
041             * 4) G(i) & (coding)
042             * 5) G(i) & (intronic)
043             * 6) G(i) & (intergenic)
044             */
045            
046            // This code is essentially unchanged from the tricycle13 feature of similar name.
047            
048            int startIx;  
049            ModelManager model;
050            KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
051            
052            int maxSeqLength;
053            
054            int nFeatures = 6;
055            int nStates;
056            
057            Boolean[] gapboundary, frameshifter;
058            Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
059    
060            int lastSeqLength = -1;
061            int lastpos = -1;
062            
063            
064            public GapFeaturesInterval29() {        
065            }
066    
067            public int getNumFeatures() {
068                    return nFeatures;
069            }       
070            
071            public String getFeatureName(int featureIndex) {
072                    String[] names = new String[] {"Coding frameshift", "Intron frameshift", "Intergenic frameshift", "Coding mod3 gap", "Intron mod3 gap", "Intergenic mod3 gap"};
073                    int raw = featureIndex - startIx;
074                    Assert.a(raw<nFeatures);
075                    String ret = names[raw];
076                    return ret;
077            }
078    
079            transient InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
080            
081            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
082                    // Try out one of the following two lines.
083                    if ( (seq != lastSeq) ) {
084                    //if( (seq.length() != lastSeqLength)  || (pos < lastpos) ) {
085                            log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
086                            performPrecomputations(seq.getX(0).getMultipleAlignment());
087                            lastSeqLength = seq.length();
088                            lastpos = pos;
089                            lastSeq = seq;
090                    }
091    
092                    if (isStateCoding[state]     && frameshifter[pos])          { result.addFeature(startIx+0, 1.0); }
093                    if (isStateIntronic[state]   && frameshifter[pos])          { result.addFeature(startIx+1, 1.0); }
094                    if (isStateIntergenic[state] && frameshifter[pos])          { result.addFeature(startIx+2, 1.0); }
095                    if (isStateCoding[state]     && gapboundary[pos])           { result.addFeature(startIx+3, 1.0); }
096                    if (isStateIntronic[state]   && gapboundary[pos])           { result.addFeature(startIx+4, 1.0); }
097                    if (isStateIntergenic[state] && gapboundary[pos])           { result.addFeature(startIx+5, 1.0); }
098            }
099    
100            private void performPrecomputations(MultipleAlignmentInputSequence seq) {
101                    // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
102    
103                    // Maybe a little inefficient with memory allocation but hopefully not too much
104                    if (seq.length() > frameshifter.length) {
105                            frameshifter = new Boolean[seq.length()];
106                            gapboundary  = new Boolean[seq.length()];
107                    }
108                    
109                    for (int j=0; j<seq.length(); j++) {
110                            frameshifter[j] = false;
111                            gapboundary[j] = false;
112                    }
113                    
114                    int numSpecies = seq.numSpecies();
115                    int consensusLength = seq.getConsensusLength();
116                    
117                    for (int spec = 0; spec<numSpecies; spec++) {
118                            boolean inGap = false;
119                            int conGapStart = -1;
120                            for (int cpos = 1; cpos< consensusLength; cpos++ ) {
121                                    if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
122                                            inGap = true;
123                                            conGapStart = cpos;
124                                    }
125                                    if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
126                                            inGap = false;
127                                            int conGapEnd = cpos-1;
128                                            int gaplen = conGapEnd - conGapStart + 1;
129                                            if (gaplen <=60) {
130                                                    if ( (gaplen%3) == 0) {
131                                                            gapboundary[seq.con2refLeft(conGapStart)] = true;
132                                                            gapboundary[seq.con2refRight(conGapEnd)] = true;
133                                                    } else {
134                                                            frameshifter[seq.con2refLeft(conGapStart)] = true;
135                                                            frameshifter[seq.con2refRight(conGapEnd)] = true;                                                       
136                                                    }
137                                                    
138                                            }
139                                            
140                                    }
141                                            
142                            }
143                            
144                    }
145                    
146            }
147    
148            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
149    
150                    startIx = startingIndex;
151                    model = modelInfo;
152                    nStates = model.getNumStates();
153    
154                    maxSeqLength = 0;                       
155                    for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
156                            int len = seq.length();
157                            if (len > maxSeqLength) { maxSeqLength = len; }
158                    }
159                    frameshifter = new Boolean[maxSeqLength];
160                    gapboundary  = new Boolean[maxSeqLength];
161                                    
162                    isStateCoding = new Boolean[nStates];       for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
163                    isStateCoding[1] = true;
164                    isStateCoding[2] = true;
165                    isStateCoding[3] = true;
166                    isStateCoding[7] = true;
167                    isStateCoding[8] = true;
168                    isStateCoding[9] = true;                
169    
170                    isStateIntronic = new Boolean[nStates];     for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
171                    isStateIntronic[4] = true;
172                    isStateIntronic[5] = true;
173                    isStateIntronic[6] = true;
174                    isStateIntronic[10] = true;
175                    isStateIntronic[11] = true;
176                    isStateIntronic[12] = true;
177                    isStateIntronic[15] = true;
178                    isStateIntronic[16] = true;
179                    isStateIntronic[17] = true;
180                    isStateIntronic[18] = true;
181                    isStateIntronic[19] = true;
182                    isStateIntronic[20] = true;
183                    isStateIntronic[23] = true;
184                    isStateIntronic[24] = true;
185                    isStateIntronic[25] = true;
186                    isStateIntronic[26] = true;
187                    isStateIntronic[27] = true;
188                    isStateIntronic[28] = true;
189    
190                    isStateIntergenic = new Boolean[nStates];   for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
191                    isStateIntergenic[0] = true;
192                    isStateIntergenic[13] = true;
193                    isStateIntergenic[14] = true;
194                    isStateIntergenic[21] = true;
195                    isStateIntergenic[22] = true;
196                    
197            }
198            @Override
199            public CacheStrategySpec getCacheStrategy() {
200                    return new CacheStrategySpec(CacheStrategy.SPARSE);
201            }
202    }