001 package calhoun.analysis.crf.features.tricycle13;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNode;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
018 import calhoun.seq.KmerHasher;
019 import calhoun.util.Assert;
020
021 public class GapConjunctionFeatures extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022 private static final long serialVersionUID = -7659288739348604129L;
023 private static final Log log = LogFactory.getLog(GapConjunctionFeatures.class);
024 boolean debug = log.isDebugEnabled();
025
026 /* Let G(i) and F(i) be booleans, one per position, that indicate whether there is
027 * a gap in the multiple alignment for which either 1) The first non-gap character
028 * of the reference sequence to the right of the gap is at position i, or 2) The last
029 * non-gap character of the reference sequence to the left of the gap is at position i;
030 * and 3) the gap is a multiple of 3. G stands for "gap"
031 *
032 * F(i) is the same thing except for a non-multiple of three length.
033 * F stands for "frameshifter".
034 *
035 * The we define indicator features for the following conjunctions:
036 * 1) F(i) & (coding)
037 * 2) F(i) & (intronic)
038 * 3) F(i) & (intergenic)
039 * 4) G(i) & (coding)
040 * 5) G(i) & (intronic)
041 * 6) G(i) & (intergenic)
042 */
043
044 int startIx;
045 ModelManager model;
046 KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
047
048 int maxSeqLength;
049
050 int nFeatures = 6;
051 int nStates;
052
053 Boolean[] gapboundary, frameshifter;
054 Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
055
056 int lastSeqLength = -1;
057 int lastpos = -1;
058
059
060 public GapConjunctionFeatures() {
061 }
062
063 public int getNumFeatures() {
064 return nFeatures;
065 }
066
067 public String getFeatureName(int featureIndex) {
068 int raw = featureIndex - startIx;
069 Assert.a(raw<nFeatures);
070 String ret = "GapConjunctionFeature." + raw;
071 return ret;
072 }
073
074 InputSequence<? extends MultipleAlignmentColumn> lastSeq = null;
075
076 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
077 // Try out one of the following two lines.
078 if ( (seq != lastSeq) ) {
079 //if( (seq.length() != lastSeqLength) || (pos < lastpos) ) {
080 log.debug(seq);
081 log.debug(lastSeq);
082 log.debug("Performing precomputations for seq of length " + seq.length() + " at position " + pos);
083 performPrecomputations(seq.getX(0).getMultipleAlignment());
084 lastSeqLength = seq.length();
085 lastpos = pos;
086 lastSeq = seq;
087 }
088
089 if (isStateCoding[state] && frameshifter[pos]) { result.addFeature(startIx+0, 1.0); }
090 if (isStateIntronic[state] && frameshifter[pos]) { result.addFeature(startIx+1, 1.0); }
091 if (isStateIntergenic[state] && frameshifter[pos]) { result.addFeature(startIx+2, 1.0); }
092 if (isStateCoding[state] && gapboundary[pos]) { result.addFeature(startIx+3, 1.0); }
093 if (isStateIntronic[state] && gapboundary[pos]) { result.addFeature(startIx+4, 1.0); }
094 if (isStateIntergenic[state] && gapboundary[pos]) { result.addFeature(startIx+5, 1.0); }
095 }
096
097 private void performPrecomputations(MultipleAlignmentInputSequence seq) {
098 // In this method is where you need to set the boolean vectors frameshifter[i] and gapboundary[i] using the multiple alignment seq.
099
100 // Maybe a little inefficient with memory allocation but hopefully not too much
101 if (seq.length() > frameshifter.length) {
102 frameshifter = new Boolean[seq.length()];
103 gapboundary = new Boolean[seq.length()];
104 }
105
106 for (int j=0; j<seq.length(); j++) {
107 frameshifter[j] = false;
108 gapboundary[j] = false;
109 }
110
111 int numSpecies = seq.numSpecies();
112 int consensusLength = seq.getConsensusLength();
113
114 for (int spec = 0; spec<numSpecies; spec++) {
115 boolean inGap = false;
116 int conGapStart = -1;
117 for (int cpos = 1; cpos< consensusLength; cpos++ ) {
118 if ( (!inGap) && (h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (!h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
119 inGap = true;
120 conGapStart = cpos;
121 }
122 if ( (inGap) && (!h.hashable(seq.characterInPaddedAlignment(cpos-1,spec))) && (h.hashable(seq.characterInPaddedAlignment(cpos,spec))) ) {
123 inGap = false;
124 int conGapEnd = cpos-1;
125 int gaplen = conGapEnd - conGapStart + 1;
126 if (gaplen <=60) {
127 if ( (gaplen%3) == 0) {
128 gapboundary[seq.con2refLeft(conGapStart)] = true;
129 gapboundary[seq.con2refRight(conGapEnd)] = true;
130 } else {
131 frameshifter[seq.con2refLeft(conGapStart)] = true;
132 frameshifter[seq.con2refRight(conGapEnd)] = true;
133 }
134
135 }
136
137 }
138
139 }
140
141 }
142
143 }
144
145 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
146
147 startIx = startingIndex;
148 model = modelInfo;
149 nStates = model.getNumStates();
150
151 maxSeqLength = 0;
152 for(TrainingSequence<? extends MultipleAlignmentColumn> seq : data) {
153 int len = seq.length();
154 if (len > maxSeqLength) { maxSeqLength = len; }
155 }
156 frameshifter = new Boolean[maxSeqLength];
157 gapboundary = new Boolean[maxSeqLength];
158
159
160 isStateCoding = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
161 isStateCoding[model.getStateIndex("exon1")] = true;
162 isStateCoding[model.getStateIndex("exon2")] = true;
163 isStateCoding[model.getStateIndex("exon3")] = true;
164 isStateCoding[model.getStateIndex("exon1m")] = true;
165 isStateCoding[model.getStateIndex("exon2m")] = true;
166 isStateCoding[model.getStateIndex("exon3m")] = true;
167
168 isStateIntronic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
169 isStateIntronic[model.getStateIndex("intron1")] = true;
170 isStateIntronic[model.getStateIndex("intron2")] = true;
171 isStateIntronic[model.getStateIndex("intron3")] = true;
172 isStateIntronic[model.getStateIndex("intron1m")] = true;
173 isStateIntronic[model.getStateIndex("intron2m")] = true;
174 isStateIntronic[model.getStateIndex("intron3m")] = true;
175
176 isStateIntergenic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
177 isStateIntergenic[model.getStateIndex("intergenic")] = true;
178
179 }
180 @Override
181 public CacheStrategySpec getCacheStrategy() {
182 return new CacheStrategySpec(CacheStrategy.UNSPECIFIED);
183 }
184 }
185