001 package calhoun.analysis.crf.features.interval29;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNode;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.features.interval13.FootprintsInterval13;
015 import calhoun.analysis.crf.io.InputSequence;
016 import calhoun.analysis.crf.io.TrainingSequence;
017 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
018 import calhoun.seq.KmerHasher;
019 import calhoun.util.Assert;
020
021 public class FootprintsInterval29 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
022 private static final long serialVersionUID = -885708304411544895L;
023 private static final Log log = LogFactory.getLog(FootprintsInterval29.class);
024 boolean debug = log.isDebugEnabled();
025
026 /* Features are the conjunction of "species X is present in multiple alignment" with hidden state is "exonic, intronic, intergenic"
027 * Is the number of features allowed to depend on the number of species inmultiple alignment??
028 */
029
030 List<String> speciesNames;
031 int startIx;
032 ModelManager model;
033 KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
034
035 int maxSeqLength;
036
037 int nFeatures = -1;
038
039 Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
040
041
042 public FootprintsInterval29() {
043 }
044
045 public int getNumFeatures() {
046 return nFeatures;
047 }
048
049 public String getFeatureName(int featureIndex) {
050 String[] type = new String[] { "intergenic", "exonic", "intronic"};
051 int raw = featureIndex - startIx;
052 Assert.a(raw<nFeatures);
053 if(speciesNames == null) {
054 return "Species "+((raw/3) + 1) + " "+type[raw%3]+" footprint";
055 }
056 return speciesNames.get((raw/3) + 1) + " "+type[raw%3]+" footprint";
057 }
058
059
060 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
061 MultipleAlignmentColumn mac = seq.getX(pos);
062 for (int species = 1; species<mac.numSpecies(); species++) {
063 if (mac.nucleotide(species) == '-') continue;
064
065 if (isStateIntergenic[state]) { result.addFeature(startIx+((species-1)*3+0), 1.0); }
066 if (isStateCoding[state]) { result.addFeature(startIx+((species-1)*3+1), 1.0); }
067 if (isStateIntronic[state]) { result.addFeature(startIx+((species-1)*3+2), 1.0); }
068 }
069 }
070
071
072 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
073 TrainingSequence<? extends MultipleAlignmentColumn> seq = data.get(0);
074 speciesNames = seq.getX(0).getMultipleAlignment().getSpeciesNames();
075
076 startIx = startingIndex;
077 model = modelInfo;
078 int nStates = model.getNumStates();
079
080 nFeatures = 3*(data.get(0).getX(0).numSpecies()-1); // Assumes this is the same for all alignments
081
082 isStateCoding = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
083 isStateCoding[1] = true;
084 isStateCoding[2] = true;
085 isStateCoding[3] = true;
086 isStateCoding[7] = true;
087 isStateCoding[8] = true;
088 isStateCoding[9] = true;
089
090 isStateIntronic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
091 isStateIntronic[4] = true;
092 isStateIntronic[5] = true;
093 isStateIntronic[6] = true;
094 isStateIntronic[10] = true;
095 isStateIntronic[11] = true;
096 isStateIntronic[12] = true;
097 isStateIntronic[15] = true;
098 isStateIntronic[16] = true;
099 isStateIntronic[17] = true;
100 isStateIntronic[18] = true;
101 isStateIntronic[19] = true;
102 isStateIntronic[20] = true;
103 isStateIntronic[23] = true;
104 isStateIntronic[24] = true;
105 isStateIntronic[25] = true;
106 isStateIntronic[26] = true;
107 isStateIntronic[27] = true;
108 isStateIntronic[28] = true;
109
110 isStateIntergenic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
111 isStateIntergenic[0] = true;
112 isStateIntergenic[13] = true;
113 isStateIntergenic[14] = true;
114 isStateIntergenic[21] = true;
115 isStateIntergenic[22] = true;
116
117 }
118 @Override
119 public CacheStrategySpec getCacheStrategy() {
120 return new CacheStrategySpec(CacheStrategy.DENSE);
121 }
122 }