001    package calhoun.analysis.crf.test;
002    
003    import java.util.List;
004    
005    import calhoun.analysis.crf.Conrad;
006    import calhoun.analysis.crf.features.interval13.ReferenceBasePredictorInterval13;
007    import calhoun.analysis.crf.features.supporting.LogProbLookup;
008    import calhoun.analysis.crf.io.InputSequenceCharacter;
009    import calhoun.analysis.crf.io.StringInput;
010    import calhoun.analysis.crf.io.TrainingSequence;
011    import calhoun.util.AbstractTestCase;
012    
013    /** Tests that CRF is working with valid probabilities - the sum of all possible labelings is 1.
014     *  
015     * Test that the code to walk through only the valid paths works correctly.
016     * Uses a two state model that disallows transitions to self.  010101... or 101010... are the only allowed paths. */
017    public class ReferenceBasePredictorTest extends AbstractTestCase {
018    
019    
020            public void testLogProbLookupEasy() throws Exception {
021                    
022                    InputSequenceCharacter seq = new InputSequenceCharacter("CTCTCTCTCTCTCTCTCTC");
023    
024                    // lp1 will measure single nucleotide frequencies for the positive strand
025                    // in this sequence there are 10 C's and 9 T's, but with pseudocounts of
026                    // 1.0 this bumps up to 11 C's 10 T's 1 A and 1 G, total 23 letters.
027                    LogProbLookup lp1 = new LogProbLookup(0,1.0);   
028                    for (int pos=0; pos<seq.length(); pos++) {
029                            lp1.increment(seq,pos,true);
030                    }
031                    lp1.finalize();
032    
033                    assertEquals(lp1.lookup(seq,0,true),-0.737598,0.001);  // log of 11/23 = -0.737599 
034                    assertEquals(lp1.lookup(seq,0,false),-3.135494,0.001); // log of 1/23 is -3.1354
035                    assertEquals(lp1.lookup(seq,1,true),-0.832909,0.001);  // log of 10/23 is -0.832909
036                    assertEquals(lp1.lookup(seq,1,false),-3.135494,0.001); // log of 1/23 is -3.1354
037                    
038    
039                    // lp2 is like lp1 except the pseudocounts will be 0.25 instead of 1.0
040                    LogProbLookup lp2 = new LogProbLookup(0,0.25);  
041                    for (int pos=0; pos<seq.length(); pos++) {
042                            lp2.increment(seq,pos,true);
043                    }
044                    lp2.finalize();
045    
046                    assertEquals(lp2.lookup(seq,0,true),-0.66845,0.001);  // log of C 10.25/20
047                    assertEquals(lp2.lookup(seq,0,false),-4.38203,0.001); // log of G 0.25/20
048                    assertEquals(lp2.lookup(seq,1,true),-0.77111,0.001);  // log of T 9.25/20
049                    assertEquals(lp2.lookup(seq,1,false),-4.38203,0.001); // log of A 0.25/20       
050            }
051            
052            
053            public void testLogProbLookupHard() throws Exception {
054                    
055                    InputSequenceCharacter seq1 = new InputSequenceCharacter("ACGTNCGTGTTCCATGGTAAC");
056                    InputSequenceCharacter seq2 = new InputSequenceCharacter("GNTTACA");
057                            
058                    
059                    // lp1 will measure probability of a letter based on previous two, pseudocounts=1.0
060                    // lp1 being trained on both strands of seq1
061                    LogProbLookup lp1 = new LogProbLookup(2,1.0);   
062                    for (int pos=0; pos<seq1.length(); pos++) {
063                            lp1.increment(seq1,pos,true);
064                            lp1.increment(seq1,pos,false);
065                    }
066                    lp1.finalize();
067    
068                    assertEquals(lp1.lookup(seq2,0,false),0.0,0.001); // should pop up as missing data, default to 0.0
069                    assertEquals(lp1.lookup(seq2,3,true),0.0,0.001); // should pop up as missing data b/c of N at position 1, default to 0.0
070                    
071                    assertEquals(lp1.lookup(seq2,4,true),-1.09861,0.001);
072                    //the history here is "TT".  In the training sequence occurs once on the positive strand,
073                    // yielding C, and once on the negative strand yeiding the revcomp(A).  
074                    // Thus the pseudocounts are: C:2, A:2, G:1, T:1
075                    // Here it's A so I expect log(2/6) = -1.09861
076            }       
077            
078            
079            public void testReferenceBasePredictor() throws Exception {
080            
081                    String configFile = "test/input/interval13/config/markov.xml";
082                    
083                    Conrad crf = new Conrad(configFile);
084                    
085                    List<? extends TrainingSequence<Character>> train1 =
086                            StringInput.prepareData(
087                                    "000000002222222666661111100000000" + "\n" +
088                                    "ACACACACATGCACAGTCAGACACATAGACACA" + "\n" +
089                                    "00000000077777CCCCCC7777000000000000" + "\n" +
090                                    "ACACACTTACACACCTACACACATACACACACACAC" + "\n");
091                    
092                    System.out.println(train1);
093                    
094                    ReferenceBasePredictorInterval13 bp = new ReferenceBasePredictorInterval13();
095                    
096                    bp.train(0,crf.getModel(),train1);
097                    
098            }
099            
100    }