001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNode;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
018 import calhoun.seq.KmerHasher;
019 import calhoun.util.Assert;
020
021 public class GapFeaturesInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022 private static final long serialVersionUID = -7659288739348604129L;
023 private static final Log log = LogFactory.getLog(GapFeaturesInterval13.class);
024 boolean debug = log.isDebugEnabled();
025 String[] featureNames = new String[] { "Frameshift coding", "Frameshift intron", "Frameshift exon", "Mod 3 gap coding", "Mod 3 gap intron", "Mod 3 gap exon"};
026
027 /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
028 * a gap in the multiple alignment for which either 1) The first non-gap character
029 * of the reference sequence to the right of the gap is at position i, or 2) The last
030 * non-gap character of the reference sequence to the left of the gap is at position i;
031 * and 3) the gap is a multiple of 3. G stands for "gap"
032 *
033 * F(i) is the same thing except for a non-multiple of three length.
034 * F stands for "frameshifter".
035 *
036 * The we define indicator features for the following conjunctions:
037 * 1) F(i) & (coding)
038 * 2) F(i) & (intronic)
039 * 3) F(i) & (intergenic)
040 * 4) G(i) & (coding)
041 * 5) G(i) & (intronic)
042 * 6) G(i) & (intergenic)
043 */
044
045 // This code is essentially unchanged from the tricycle13 feature of similar name.
046
047 int startIx;
048 ModelManager model;
049 KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
050
051 int maxSeqLength;
052
053 int nFeatures = 6;
054 int nStates;
055
056 Boolean[] gapboundary, frameshifter;
057 Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
058
059 int lastSeqLength = -1;
060 int lastpos = -1;
061
062
063 public GapFeaturesInterval13() {
064 }
065
066 public int getNumFeatures() {
067 return nFeatures;
068 }
069
070 public String getFeatureName(int featureIndex) {
071 String[] names = new String[] {"Coding frameshift", "Intron frameshift", "Intergenic frameshift", "Coding mod3 gap", "Intron mod3 gap", "Intergenic mod3 gap"};
072 int raw = featureIndex - startIx;
073 Assert.a(raw<nFeatures);
074 String ret = names[raw];
075 return ret;
076 }
077
078 transient InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
079
080 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
081 // Try out one of the following two lines.
082 if ( (seq != lastSeq) ) {
083 //if( (seq.length() != lastSeqLength) || (pos < lastpos) ) {
084 log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
085 performPrecomputations(seq.getX(0).getMultipleAlignment());
086 lastSeqLength = seq.length();
087 lastpos = pos;
088 lastSeq = seq;
089 }
090
091 if (isStateCoding[state] && frameshifter[pos]) { result.addFeature(startIx+0, 1.0); }
092 if (isStateIntronic[state] && frameshifter[pos]) { result.addFeature(startIx+1, 1.0); }
093 if (isStateIntergenic[state] && frameshifter[pos]) { result.addFeature(startIx+2, 1.0); }
094 if (isStateCoding[state] && gapboundary[pos]) { result.addFeature(startIx+3, 1.0); }
095 if (isStateIntronic[state] && gapboundary[pos]) { result.addFeature(startIx+4, 1.0); }
096 if (isStateIntergenic[state] && gapboundary[pos]) { result.addFeature(startIx+5, 1.0); }
097 }
098
099 private void performPrecomputations(MultipleAlignmentInputSequence seq) {
100 // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
101
102 // Maybe a little inefficient with memory allocation but hopefully not too much
103 if (seq.length() > frameshifter.length) {
104 frameshifter = new Boolean[seq.length()];
105 gapboundary = new Boolean[seq.length()];
106 }
107
108 for (int j=0; j<seq.length(); j++) {
109 frameshifter[j] = false;
110 gapboundary[j] = false;
111 }
112
113 int numSpecies = seq.numSpecies();
114 int consensusLength = seq.getConsensusLength();
115
116 for (int spec = 0; spec<numSpecies; spec++) {
117 boolean inGap = false;
118 int conGapStart = -1;
119 for (int cpos = 1; cpos< consensusLength; cpos++ ) {
120 if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
121 inGap = true;
122 conGapStart = cpos;
123 }
124 if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
125 inGap = false;
126 int conGapEnd = cpos-1;
127 int gaplen = conGapEnd - conGapStart + 1;
128 if (gaplen <=60) {
129 if ( (gaplen%3) == 0) {
130 gapboundary[seq.con2refLeft(conGapStart)] = true;
131 gapboundary[seq.con2refRight(conGapEnd)] = true;
132 } else {
133 frameshifter[seq.con2refLeft(conGapStart)] = true;
134 frameshifter[seq.con2refRight(conGapEnd)] = true;
135 }
136
137 }
138
139 }
140
141 }
142
143 }
144
145 }
146
147 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
148
149 startIx = startingIndex;
150 model = modelInfo;
151 nStates = model.getNumStates();
152
153 maxSeqLength = 0;
154 for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
155 int len = seq.length();
156 if (len > maxSeqLength) { maxSeqLength = len; }
157 }
158 frameshifter = new Boolean[maxSeqLength];
159 gapboundary = new Boolean[maxSeqLength];
160
161 isStateCoding = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
162 isStateCoding[1] = true;
163 isStateCoding[2] = true;
164 isStateCoding[3] = true;
165 isStateCoding[7] = true;
166 isStateCoding[8] = true;
167 isStateCoding[9] = true;
168
169 isStateIntronic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
170 isStateIntronic[4] = true;
171 isStateIntronic[5] = true;
172 isStateIntronic[6] = true;
173 isStateIntronic[10] = true;
174 isStateIntronic[11] = true;
175 isStateIntronic[12] = true;
176
177 isStateIntergenic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
178 isStateIntergenic[0] = true;
179
180 }
181 @Override
182 public CacheStrategySpec getCacheStrategy() {
183 return new CacheStrategySpec(CacheStrategy.SPARSE);
184 }
185
186 }
187