001 package calhoun.analysis.crf.features.interval13;
002
003 import java.util.List;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.AbstractFeatureManager;
009 import calhoun.analysis.crf.CacheStrategySpec;
010 import calhoun.analysis.crf.FeatureList;
011 import calhoun.analysis.crf.FeatureManagerNode;
012 import calhoun.analysis.crf.ModelManager;
013 import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014 import calhoun.analysis.crf.io.InputSequence;
015 import calhoun.analysis.crf.io.TrainingSequence;
016 import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
017 import calhoun.seq.KmerHasher;
018 import calhoun.util.Assert;
019
020 public class FootprintsInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
021 private static final long serialVersionUID = -7659288739348604129L;
022 private static final Log log = LogFactory.getLog(FootprintsInterval13.class);
023 boolean debug = log.isDebugEnabled();
024
025 /* Features are the conjunction of "species X is present in multiple alignment" with hidden state is "exonic, intronic, intergenic"
026 * Is the number of features allowed to depend on the number of species inmultiple alignment??
027 */
028
029 List<String> speciesNames;
030 int startIx;
031 ModelManager model;
032 KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
033
034 int maxSeqLength;
035
036 int nFeatures = -1;
037
038 Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
039
040
041 public FootprintsInterval13() {
042 }
043
044 public int getNumFeatures() {
045 return nFeatures;
046 }
047
048 public String getFeatureName(int featureIndex) {
049 String[] type = new String[] { "intergenic", "exonic", "intronic"};
050 int raw = featureIndex - startIx;
051 Assert.a(raw<nFeatures);
052 if(speciesNames == null) {
053 return "Species "+((raw/3) + 1) + " "+type[raw%3]+" footprint";
054 }
055 return speciesNames.get((raw/3) + 1) + " "+type[raw%3]+" footprint";
056 }
057
058
059 public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
060 MultipleAlignmentColumn mac = seq.getX(pos);
061 for (int species = 1; species<mac.numSpecies(); species++) {
062 if (mac.nucleotide(species) == '-') continue;
063
064 if (isStateIntergenic[state]) { result.addFeature(startIx+((species-1)*3+0), 1.0); }
065 if (isStateCoding[state]) { result.addFeature(startIx+((species-1)*3+1), 1.0); }
066 if (isStateIntronic[state]) { result.addFeature(startIx+((species-1)*3+2), 1.0); }
067 }
068 }
069
070
071 public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
072 TrainingSequence<? extends MultipleAlignmentColumn> seq = data.get(0);
073 speciesNames = seq.getX(0).getMultipleAlignment().getSpeciesNames();
074
075 startIx = startingIndex;
076 model = modelInfo;
077 int nStates = model.getNumStates();
078
079 nFeatures = 3*(data.get(0).getX(0).numSpecies()-1); // Assumes this is the same for all alignments
080
081 isStateCoding = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
082 isStateCoding[1] = true;
083 isStateCoding[2] = true;
084 isStateCoding[3] = true;
085 isStateCoding[7] = true;
086 isStateCoding[8] = true;
087 isStateCoding[9] = true;
088
089 isStateIntronic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
090 isStateIntronic[4] = true;
091 isStateIntronic[5] = true;
092 isStateIntronic[6] = true;
093 isStateIntronic[10] = true;
094 isStateIntronic[11] = true;
095 isStateIntronic[12] = true;
096
097 isStateIntergenic = new Boolean[nStates]; for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
098 isStateIntergenic[0] = true;
099
100 }
101 @Override
102 public CacheStrategySpec getCacheStrategy() {
103 return new CacheStrategySpec(CacheStrategy.DENSE);
104 }
105
106 }
107