001    package calhoun.analysis.crf.features.interval13;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerNode;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
016    import calhoun.analysis.crf.io.TrainingSequence;
017    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
018    import calhoun.seq.KmerHasher;
019    import calhoun.util.Assert;
020    
021    public class GapFeaturesInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022            private static final long serialVersionUID = -7659288739348604129L;
023            private static final Log log = LogFactory.getLog(GapFeaturesInterval13.class);
024            boolean debug = log.isDebugEnabled();
025            String[] featureNames = new String[] { "Frameshift coding", "Frameshift intron", "Frameshift exon", "Mod 3 gap coding", "Mod 3 gap intron", "Mod 3 gap exon"};
026            
027            /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
028             * a gap in the multiple alignment for which either 1) The first non-gap character
029             * of the reference sequence to the right of the gap is at position i, or 2) The last
030             * non-gap character of the reference sequence to the left of the gap is at position i;
031             * and 3) the gap is a multiple of 3.  G stands for "gap"
032             * 
033             * F(i) is the same thing except for a non-multiple of three length.
034             * F stands for "frameshifter".
035             * 
036             * The we define indicator features for the following conjunctions:
037             * 1) F(i) & (coding)
038             * 2) F(i) & (intronic)
039             * 3) F(i) & (intergenic)
040             * 4) G(i) & (coding)
041             * 5) G(i) & (intronic)
042             * 6) G(i) & (intergenic)
043             */
044            
045            // This code is essentially unchanged from the tricycle13 feature of similar name.
046            
047            int startIx;  
048            ModelManager model;
049            KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
050            
051            int maxSeqLength;
052            
053            int nFeatures = 6;
054            int nStates;
055            
056            Boolean[] gapboundary, frameshifter;
057            Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
058    
059            int lastSeqLength = -1;
060            int lastpos = -1;
061            
062            
063            public GapFeaturesInterval13() {        
064            }
065    
066            public int getNumFeatures() {
067                    return nFeatures;
068            }       
069            
070            public String getFeatureName(int featureIndex) {
071                    String[] names = new String[] {"Coding frameshift", "Intron frameshift", "Intergenic frameshift", "Coding mod3 gap", "Intron mod3 gap", "Intergenic mod3 gap"};
072                    int raw = featureIndex - startIx;
073                    Assert.a(raw<nFeatures);
074                    String ret = names[raw];
075                    return ret;
076            }
077    
078            transient InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
079            
080            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
081                    // Try out one of the following two lines.
082                    if ( (seq != lastSeq) ) {
083                    //if( (seq.length() != lastSeqLength)  || (pos < lastpos) ) {
084                            log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
085                            performPrecomputations(seq.getX(0).getMultipleAlignment());
086                            lastSeqLength = seq.length();
087                            lastpos = pos;
088                            lastSeq = seq;
089                    }
090    
091                    if (isStateCoding[state]     && frameshifter[pos])          { result.addFeature(startIx+0, 1.0); }
092                    if (isStateIntronic[state]   && frameshifter[pos])          { result.addFeature(startIx+1, 1.0); }
093                    if (isStateIntergenic[state] && frameshifter[pos])          { result.addFeature(startIx+2, 1.0); }
094                    if (isStateCoding[state]     && gapboundary[pos])           { result.addFeature(startIx+3, 1.0); }
095                    if (isStateIntronic[state]   && gapboundary[pos])           { result.addFeature(startIx+4, 1.0); }
096                    if (isStateIntergenic[state] && gapboundary[pos])           { result.addFeature(startIx+5, 1.0); }
097            }
098    
099            private void performPrecomputations(MultipleAlignmentInputSequence seq) {
100                    // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
101    
102                    // Maybe a little inefficient with memory allocation but hopefully not too much
103                    if (seq.length() > frameshifter.length) {
104                            frameshifter = new Boolean[seq.length()];
105                            gapboundary  = new Boolean[seq.length()];
106                    }
107                    
108                    for (int j=0; j<seq.length(); j++) {
109                            frameshifter[j] = false;
110                            gapboundary[j] = false;
111                    }
112                    
113                    int numSpecies = seq.numSpecies();
114                    int consensusLength = seq.getConsensusLength();
115                    
116                    for (int spec = 0; spec<numSpecies; spec++) {
117                            boolean inGap = false;
118                            int conGapStart = -1;
119                            for (int cpos = 1; cpos< consensusLength; cpos++ ) {
120                                    if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
121                                            inGap = true;
122                                            conGapStart = cpos;
123                                    }
124                                    if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
125                                            inGap = false;
126                                            int conGapEnd = cpos-1;
127                                            int gaplen = conGapEnd - conGapStart + 1;
128                                            if (gaplen <=60) {
129                                                    if ( (gaplen%3) == 0) {
130                                                            gapboundary[seq.con2refLeft(conGapStart)] = true;
131                                                            gapboundary[seq.con2refRight(conGapEnd)] = true;
132                                                    } else {
133                                                            frameshifter[seq.con2refLeft(conGapStart)] = true;
134                                                            frameshifter[seq.con2refRight(conGapEnd)] = true;                                                       
135                                                    }
136                                                    
137                                            }
138                                            
139                                    }
140                                            
141                            }
142                            
143                    }
144                    
145            }
146    
147            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
148    
149                    startIx = startingIndex;
150                    model = modelInfo;
151                    nStates = model.getNumStates();
152    
153                    maxSeqLength = 0;                       
154                    for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
155                            int len = seq.length();
156                            if (len > maxSeqLength) { maxSeqLength = len; }
157                    }
158                    frameshifter = new Boolean[maxSeqLength];
159                    gapboundary  = new Boolean[maxSeqLength];
160                                    
161                    isStateCoding = new Boolean[nStates];       for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
162                    isStateCoding[1] = true;
163                    isStateCoding[2] = true;
164                    isStateCoding[3] = true;
165                    isStateCoding[7] = true;
166                    isStateCoding[8] = true;
167                    isStateCoding[9] = true;                
168    
169                    isStateIntronic = new Boolean[nStates];     for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
170                    isStateIntronic[4] = true;
171                    isStateIntronic[5] = true;
172                    isStateIntronic[6] = true;
173                    isStateIntronic[10] = true;
174                    isStateIntronic[11] = true;
175                    isStateIntronic[12] = true;
176    
177                    isStateIntergenic = new Boolean[nStates];   for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
178                    isStateIntergenic[0] = true;
179                    
180            }
181            @Override
182            public CacheStrategySpec getCacheStrategy() {
183                    return new CacheStrategySpec(CacheStrategy.SPARSE);
184            }
185            
186    }
187