001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNode;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.features.interval13.GapFeaturesInterval13;
015 import calhoun.analysis.crf.io.InputSequence;
016 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
017 import calhoun.analysis.crf.io.TrainingSequence;
018 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
019 import calhoun.seq.KmerHasher;
020 import calhoun.util.Assert;
021
022 public class GapFeaturesInterval29 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn>{
023 private static final long serialVersionUID = 8484623829139711313L;
024 private static final Log log = LogFactory.getLog(GapFeaturesInterval29.class);
025 boolean debug = log.isDebugEnabled();
026 String[] featureNames = new String[] { "Frameshift coding", "Frameshift intron", "Frameshift exon", "Mod 3 gap coding", "Mod 3 gap intron", "Mod 3 gap exon"};
027
028 /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
029 * a gap in the multiple alignment for which either 1) The first non-gap character
030 * of the reference sequence to the right of the gap is at position i, or 2) The last
031 * non-gap character of the reference sequence to the left of the gap is at position i;
032 * and 3) the gap is a multiple of 3. G stands for "gap"
033 *
034 * F(i) is the same thing except for a non-multiple of three length.
035 * F stands for "frameshifter".
036 *
037 * The we define indicator features for the following conjunctions:
038 * 1) F(i) & (coding)
039 * 2) F(i) & (intronic)
040 * 3) F(i) & (intergenic)
041 * 4) G(i) & (coding)
042 * 5) G(i) & (intronic)
043 * 6) G(i) & (intergenic)
044 */
045
046 // This code is essentially unchanged from the tricycle13 feature of similar name.
047
048 int startIx;
049 ModelManager model;
050 KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
051
052 int maxSeqLength;
053
054 int nFeatures = 6;
055 int nStates;
056
057 Boolean[] gapboundary, frameshifter;
058 Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
059
060 int lastSeqLength = -1;
061 int lastpos = -1;
062
063
064 public GapFeaturesInterval29() {
065 }
066
067 public int getNumFeatures() {
068 return nFeatures;
069 }
070
071 public String getFeatureName(int featureIndex) {
072 String[] names = new String[] {"Coding frameshift", "Intron frameshift", "Intergenic frameshift", "Coding mod3 gap", "Intron mod3 gap", "Intergenic mod3 gap"};
073 int raw = featureIndex - startIx;
074 Assert.a(raw<nFeatures);
075 String ret = names[raw];
076 return ret;
077 }
078
079 transient InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
080
081 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
082 // Try out one of the following two lines.
083 if ( (seq != lastSeq) ) {
084 //if( (seq.length() != lastSeqLength) || (pos < lastpos) ) {
085 log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
086 performPrecomputations(seq.getX(0).getMultipleAlignment());
087 lastSeqLength = seq.length();
088 lastpos = pos;
089 lastSeq = seq;
090 }
091
092 if (isStateCoding[state] && frameshifter[pos]) { result.addFeature(startIx+0, 1.0); }
093 if (isStateIntronic[state] && frameshifter[pos]) { result.addFeature(startIx+1, 1.0); }
094 if (isStateIntergenic[state] && frameshifter[pos]) { result.addFeature(startIx+2, 1.0); }
095 if (isStateCoding[state] && gapboundary[pos]) { result.addFeature(startIx+3, 1.0); }
096 if (isStateIntronic[state] && gapboundary[pos]) { result.addFeature(startIx+4, 1.0); }
097 if (isStateIntergenic[state] && gapboundary[pos]) { result.addFeature(startIx+5, 1.0); }
098 }
099
100 private void performPrecomputations(MultipleAlignmentInputSequence seq) {
101 // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
102
103 // Maybe a little inefficient with memory allocation but hopefully not too much
104 if (seq.length() > frameshifter.length) {
105 frameshifter = new Boolean[seq.length()];
106 gapboundary = new Boolean[seq.length()];
107 }
108
109 for (int j=0; j<seq.length(); j++) {
110 frameshifter[j] = false;
111 gapboundary[j] = false;
112 }
113
114 int numSpecies = seq.numSpecies();
115 int consensusLength = seq.getConsensusLength();
116
117 for (int spec = 0; spec<numSpecies; spec++) {
118 boolean inGap = false;
119 int conGapStart = -1;
120 for (int cpos = 1; cpos< consensusLength; cpos++ ) {
121 if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
122 inGap = true;
123 conGapStart = cpos;
124 }
125 if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
126 inGap = false;
127 int conGapEnd = cpos-1;
128 int gaplen = conGapEnd - conGapStart + 1;
129 if (gaplen <=60) {
130 if ( (gaplen%3) == 0) {
131 gapboundary[seq.con2refLeft(conGapStart)] = true;
132 gapboundary[seq.con2refRight(conGapEnd)] = true;
133 } else {
134 frameshifter[seq.con2refLeft(conGapStart)] = true;
135 frameshifter[seq.con2refRight(conGapEnd)] = true;
136 }
137
138 }
139
140 }
141
142 }
143
144 }
145
146 }
147
148 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
149
150 startIx = startingIndex;
151 model = modelInfo;
152 nStates = model.getNumStates();
153
154 maxSeqLength = 0;
155 for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
156 int len = seq.length();
157 if (len > maxSeqLength) { maxSeqLength = len; }
158 }
159 frameshifter = new Boolean[maxSeqLength];
160 gapboundary = new Boolean[maxSeqLength];
161
162 isStateCoding = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
163 isStateCoding[1] = true;
164 isStateCoding[2] = true;
165 isStateCoding[3] = true;
166 isStateCoding[7] = true;
167 isStateCoding[8] = true;
168 isStateCoding[9] = true;
169
170 isStateIntronic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
171 isStateIntronic[4] = true;
172 isStateIntronic[5] = true;
173 isStateIntronic[6] = true;
174 isStateIntronic[10] = true;
175 isStateIntronic[11] = true;
176 isStateIntronic[12] = true;
177 isStateIntronic[15] = true;
178 isStateIntronic[16] = true;
179 isStateIntronic[17] = true;
180 isStateIntronic[18] = true;
181 isStateIntronic[19] = true;
182 isStateIntronic[20] = true;
183 isStateIntronic[23] = true;
184 isStateIntronic[24] = true;
185 isStateIntronic[25] = true;
186 isStateIntronic[26] = true;
187 isStateIntronic[27] = true;
188 isStateIntronic[28] = true;
189
190 isStateIntergenic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
191 isStateIntergenic[0] = true;
192 isStateIntergenic[13] = true;
193 isStateIntergenic[14] = true;
194 isStateIntergenic[21] = true;
195 isStateIntergenic[22] = true;
196
197 }
198 @Override
199 public CacheStrategySpec getCacheStrategy() {
200 return new CacheStrategySpec(CacheStrategy.SPARSE);
201 }
202 }