001 package calhoun.analysis.crf.test;
002
003 import java.util.List;
004
005 import calhoun.analysis.crf.Conrad;
006 import calhoun.analysis.crf.features.interval13.ReferenceBasePredictorInterval13;
007 import calhoun.analysis.crf.features.supporting.LogProbLookup;
008 import calhoun.analysis.crf.io.InputSequenceCharacter;
009 import calhoun.analysis.crf.io.StringInput;
010 import calhoun.analysis.crf.io.TrainingSequence;
011 import calhoun.util.AbstractTestCase;
012
013 /** Tests that CRF is working with valid probabilities - the sum of all possible labelings is 1.
014 *
015 * Test that the code to walk through only the valid paths works correctly.
016 * Uses a two state model that disallows transitions to self. 010101... or 101010... are the only allowed paths. */
017 public class ReferenceBasePredictorTest extends AbstractTestCase {
018
019
020 public void testLogProbLookupEasy() throws Exception {
021
022 InputSequenceCharacter seq = new InputSequenceCharacter("CTCTCTCTCTCTCTCTCTC");
023
024 // lp1 will measure single nucleotide frequencies for the positive strand
025 // in this sequence there are 10 C's and 9 T's, but with pseudocounts of
026 // 1.0 this bumps up to 11 C's 10 T's 1 A and 1 G, total 23 letters.
027 LogProbLookup lp1 = new LogProbLookup(0,1.0);
028 for (int pos=0; pos<seq.length(); pos++) {
029 lp1.increment(seq,pos,true);
030 }
031 lp1.finalize();
032
033 assertEquals(lp1.lookup(seq,0,true),-0.737598,0.001); // log of 11/23 = -0.737599
034 assertEquals(lp1.lookup(seq,0,false),-3.135494,0.001); // log of 1/23 is -3.1354
035 assertEquals(lp1.lookup(seq,1,true),-0.832909,0.001); // log of 10/23 is -0.832909
036 assertEquals(lp1.lookup(seq,1,false),-3.135494,0.001); // log of 1/23 is -3.1354
037
038
039 // lp2 is like lp1 except the pseudocounts will be 0.25 instead of 1.0
040 LogProbLookup lp2 = new LogProbLookup(0,0.25);
041 for (int pos=0; pos<seq.length(); pos++) {
042 lp2.increment(seq,pos,true);
043 }
044 lp2.finalize();
045
046 assertEquals(lp2.lookup(seq,0,true),-0.66845,0.001); // log of C 10.25/20
047 assertEquals(lp2.lookup(seq,0,false),-4.38203,0.001); // log of G 0.25/20
048 assertEquals(lp2.lookup(seq,1,true),-0.77111,0.001); // log of T 9.25/20
049 assertEquals(lp2.lookup(seq,1,false),-4.38203,0.001); // log of A 0.25/20
050 }
051
052
053 public void testLogProbLookupHard() throws Exception {
054
055 InputSequenceCharacter seq1 = new InputSequenceCharacter("ACGTNCGTGTTCCATGGTAAC");
056 InputSequenceCharacter seq2 = new InputSequenceCharacter("GNTTACA");
057
058
059 // lp1 will measure probability of a letter based on previous two, pseudocounts=1.0
060 // lp1 being trained on both strands of seq1
061 LogProbLookup lp1 = new LogProbLookup(2,1.0);
062 for (int pos=0; pos<seq1.length(); pos++) {
063 lp1.increment(seq1,pos,true);
064 lp1.increment(seq1,pos,false);
065 }
066 lp1.finalize();
067
068 assertEquals(lp1.lookup(seq2,0,false),0.0,0.001); // should pop up as missing data, default to 0.0
069 assertEquals(lp1.lookup(seq2,3,true),0.0,0.001); // should pop up as missing data b/c of N at position 1, default to 0.0
070
071 assertEquals(lp1.lookup(seq2,4,true),-1.09861,0.001);
072 //the history here is "TT". In the training sequence occurs once on the positive strand,
073 // yielding C, and once on the negative strand yeiding the revcomp(A).
074 // Thus the pseudocounts are: C:2, A:2, G:1, T:1
075 // Here it's A so I expect log(2/6) = -1.09861
076 }
077
078
079 public void testReferenceBasePredictor() throws Exception {
080
081 String configFile = "test/input/interval13/config/markov.xml";
082
083 Conrad crf = new Conrad(configFile);
084
085 List<? extends TrainingSequence<Character>> train1 =
086 StringInput.prepareData(
087 "000000002222222666661111100000000" + "\n" +
088 "ACACACACATGCACAGTCAGACACATAGACACA" + "\n" +
089 "00000000077777CCCCCC7777000000000000" + "\n" +
090 "ACACACTTACACACCTACACACATACACACACACAC" + "\n");
091
092 System.out.println(train1);
093
094 ReferenceBasePredictorInterval13 bp = new ReferenceBasePredictorInterval13();
095
096 bp.train(0,crf.getModel(),train1);
097
098 }
099
100 }