001 package calhoun.analysis.crf.features.supporting;
002
003 import java.io.Serializable;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.analysis.crf.io.InputSequence;
009 import calhoun.seq.KmerHasher;
010 import calhoun.util.Assert;
011
012
013 public class LogProbLookup implements Serializable {
014 private static final long serialVersionUID = -9195647924401633963L;
015 private static final Log log = LogFactory.getLog(LogProbLookup.class);
016 final KmerHasher.CharacterHash hashForward = KmerHasher.ACGTother;
017 final KmerHasher.CharacterHash hashReverse = KmerHasher.ACGTotherRC;
018 boolean finalized;
019
020 final int mult = 4;
021 int maxLookBack;
022
023 double[] lookupTable;
024 int lookupTableSize;
025
026 public LogProbLookup(int lookBack, double pseudoCount) {
027 Assert.a(lookBack >= 0);
028 Assert.a(lookBack < 10);
029 this.maxLookBack = lookBack;
030 finalized = false;
031
032 lookupTableSize = 1;
033 for (int i=0; i<=lookBack; i++) {
034 lookupTableSize *= mult;
035 }
036 lookupTable = new double[lookupTableSize];
037
038 for (int i=0; i<lookupTableSize; i++) {
039 lookupTable[i] = pseudoCount;
040 }
041 }
042
043 private boolean isHistory(InputSequence<? extends Character> seq, int pos) {
044
045 for (int j=pos-maxLookBack; j<=pos+maxLookBack; j++) {
046 if (hashForward.hash(seq.getX(j))==4) { return false; }
047 // Above is identical to checking hashReverse
048 // If there are N's within history window in either dircetion, want to ignore this position
049 }
050 return true;
051 }
052
053 private int getInd(InputSequence<? extends Character> seq, int pos, boolean isPlus) {
054 int ind = 0;
055 if (isPlus) {
056 if (pos < maxLookBack) { return -1; }
057 for (int j=pos-maxLookBack; j<=pos; j++) {
058 int h = hashForward.hash( (char) seq.getX(j));
059 //int h = hashForward.hash('A');
060 if (h<4) {
061 ind *= mult;
062 ind += h;
063 } else {
064 return -1;
065 }
066 }
067 } else {
068 if (pos + maxLookBack >= seq.length()) { return -1; }
069 for (int j=pos+maxLookBack; j>=pos; j--) {
070 int h = hashReverse.hash( (char) seq.getX(j));
071 if (h<4) {
072 ind *= mult;
073 ind += h;
074 } else {
075 return -1;
076 }
077 }
078 }
079 return ind;
080 }
081
082 public void increment(InputSequence<? extends Character> seq, int pos, boolean isPlus) {
083 Assert.a(!finalized);
084 int ind = getInd(seq,pos,isPlus);
085 if (ind >=0) {
086 lookupTable[ind] += 1.0;
087 }
088 }
089
090 @Override
091 public void finalize() {
092 Assert.a(!finalized);
093 log.debug("finalizing a LogProbLookup, lookupTablesize="+lookupTableSize + " mult=" + mult);
094 for (int i=0; i<lookupTableSize/mult; i++) {
095 // System.out.println("finalizing i=" + i);
096 double sum = 0;
097 for (int j=mult*i; j<mult*(i+1); j++) {
098 sum += lookupTable[j];
099 }
100 for (int j=mult*i; j<mult*(i+1); j++) {
101 lookupTable[j] = Math.log(lookupTable[j]/sum);
102 }
103 }
104 finalized = true;
105 }
106
107 public double lookup(InputSequence<? extends Character> seq, int pos, boolean isPlus) {
108 Assert.a(finalized);
109 int ind = getInd(seq,pos,isPlus);
110 if (ind >= 0) {
111 return lookupTable[ind];
112 }
113 return 0.0;
114 }
115
116 }