001    package calhoun.analysis.crf.features.interval13;
002    
003    import java.util.List;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.analysis.crf.AbstractFeatureManager;
009    import calhoun.analysis.crf.CacheStrategySpec;
010    import calhoun.analysis.crf.FeatureList;
011    import calhoun.analysis.crf.FeatureManagerNode;
012    import calhoun.analysis.crf.ModelManager;
013    import calhoun.analysis.crf.CacheStrategySpec.CacheStrategy;
014    import calhoun.analysis.crf.io.InputSequence;
015    import calhoun.analysis.crf.io.TrainingSequence;
016    import calhoun.analysis.crf.io.MultipleAlignmentInputSequence.MultipleAlignmentColumn;
017    import calhoun.seq.KmerHasher;
018    import calhoun.util.Assert;
019    
020    public class FootprintsInterval13 extends AbstractFeatureManager<MultipleAlignmentColumn> implements FeatureManagerNode<MultipleAlignmentColumn> {
021            private static final long serialVersionUID = -7659288739348604129L;
022            private static final Log log = LogFactory.getLog(FootprintsInterval13.class);
023            boolean debug = log.isDebugEnabled();
024            
025            /* Features are the conjunction of "species X is present in multiple alignment" with hidden state is "exonic, intronic, intergenic"
026             * Is the number of features allowed to depend on the number of species inmultiple alignment??
027             */
028    
029            List<String> speciesNames;
030            int startIx;  
031            ModelManager model;
032            KmerHasher h = new KmerHasher(KmerHasher.ACGTN,1);
033            
034            int maxSeqLength;
035            
036            int nFeatures = -1;
037            
038            Boolean[] isStateCoding, isStateIntronic, isStateIntergenic;
039    
040            
041            public FootprintsInterval13() { 
042            }
043    
044            public int getNumFeatures() {
045                    return nFeatures;
046            }       
047            
048            public String getFeatureName(int featureIndex) {
049                    String[] type = new String[] { "intergenic", "exonic", "intronic"};
050                    int raw = featureIndex - startIx;
051                    Assert.a(raw<nFeatures);
052                    if(speciesNames == null) {
053                            return "Species "+((raw/3) + 1) + " "+type[raw%3]+" footprint";
054                    }
055                    return speciesNames.get((raw/3) + 1) + " "+type[raw%3]+" footprint";
056            }
057            
058            
059            public void evaluateNode(InputSequence<? extends MultipleAlignmentColumn> seq, int pos, int state, FeatureList result) {
060                    MultipleAlignmentColumn mac = seq.getX(pos);
061                    for (int species = 1; species<mac.numSpecies(); species++) {
062                            if (mac.nucleotide(species) == '-') continue;
063                            
064                            if (isStateIntergenic[state]) { result.addFeature(startIx+((species-1)*3+0), 1.0); }
065                            if (isStateCoding[state])     { result.addFeature(startIx+((species-1)*3+1), 1.0); }
066                            if (isStateIntronic[state])   { result.addFeature(startIx+((species-1)*3+2), 1.0); }
067                    }
068            }
069    
070    
071            public void train(int startingIndex, ModelManager modelInfo, List<? extends TrainingSequence<? extends MultipleAlignmentColumn>> data) {
072                    TrainingSequence<? extends MultipleAlignmentColumn> seq = data.get(0);
073                    speciesNames = seq.getX(0).getMultipleAlignment().getSpeciesNames();
074                    
075                    startIx = startingIndex;
076                    model = modelInfo;
077                    int nStates = model.getNumStates();
078    
079                    nFeatures = 3*(data.get(0).getX(0).numSpecies()-1);  // Assumes this is the same for all alignments
080                                                    
081                    isStateCoding = new Boolean[nStates];       for (int j=0; j<nStates; j++) { isStateCoding[j] = false; }
082                    isStateCoding[1] = true;
083                    isStateCoding[2] = true;
084                    isStateCoding[3] = true;
085                    isStateCoding[7] = true;
086                    isStateCoding[8] = true;
087                    isStateCoding[9] = true;                
088    
089                    isStateIntronic = new Boolean[nStates];     for (int j=0; j<nStates; j++) { isStateIntronic[j] = false; }
090                    isStateIntronic[4] = true;
091                    isStateIntronic[5] = true;
092                    isStateIntronic[6] = true;
093                    isStateIntronic[10] = true;
094                    isStateIntronic[11] = true;
095                    isStateIntronic[12] = true;
096    
097                    isStateIntergenic = new Boolean[nStates];   for (int j=0; j<nStates; j++) { isStateIntergenic[j] = false; }
098                    isStateIntergenic[0] = true;
099                    
100            }
101            @Override
102            public CacheStrategySpec getCacheStrategy() {
103                    return new CacheStrategySpec(CacheStrategy.DENSE);
104            }
105            
106    }
107