001    package calhoun.analysis.crf.io;
002    
003    import java.io.IOException;
004    import java.util.ArrayList;
005    import java.util.Arrays;
006    import java.util.Collections;
007    import java.util.HashMap;
008    import java.util.Iterator;
009    import java.util.List;
010    import java.util.Map;
011    
012    import org.apache.commons.lang.builder.CompareToBuilder;
013    import org.apache.commons.lang.builder.EqualsBuilder;
014    import org.apache.commons.lang.builder.HashCodeBuilder;
015    import org.apache.commons.logging.Log;
016    import org.apache.commons.logging.LogFactory;
017    
018    import calhoun.analysis.crf.FeatureList;
019    import calhoun.analysis.crf.features.interval13.GeneConstraintsInterval13;
020    import calhoun.analysis.crf.solver.CacheProcessor.FeatureEvaluation;
021    import calhoun.util.Assert;
022    import calhoun.util.FileUtil;
023    
024    public class GTFInputInterval13 implements TrainingSequenceIO {
025            private static final long serialVersionUID = 4413724139445660883L;
026            private static final Log log = LogFactory.getLog(GTFInputInterval13.class);
027    
028            String nameComponent = "name";
029            
030            static class CDS implements Comparable<CDS> {
031                    String seq;
032                    String gene;
033                    int start;
034                    int stop;
035                    char strand;
036                    
037                    public int compareTo(CDS other) {
038                            // Compare order is seq, gene, start.  Using gene second ensures we don't have duplicated gene names or overlapping genes
039                         return new CompareToBuilder().append(seq, other.seq).append(gene, other.gene).append(start, other.start).toComparison();
040                    }
041    
042                    public boolean equals(CDS other) {
043                         return new EqualsBuilder().append(seq, other.seq).append(gene, other.gene).append(start, other.start).isEquals();
044                    }
045    
046                    public long hashcode() {
047                         return new HashCodeBuilder().append(seq).append(gene).append(start).hashCode();
048                    }
049            }
050            
051            public void readTrainingSequences(Object location, List<TrainingSequence<Map<String, Object>>> seqs) throws IOException {
052                    String[][] gtf = FileUtil.readFlatFile((String) location);
053            
054                    // Create a list of exons for each sequence
055                    Map<String, List<CDS>> exonLists = new HashMap();
056                    for(TrainingSequence<Map<String, Object>> seq : seqs) {
057                            exonLists.put((String) seq.getInputSequence().getComponent("name").getX(0), new ArrayList());
058                    }
059                    int threwout = 0;
060                    int kept = 0;
061                    // First read in all of the exons into an array and sort by sequence and position
062                    for(String[] row : gtf) {
063                            if(row[2].equalsIgnoreCase("cds")) {
064                                    CDS cds = new CDS();
065                                    cds.seq = row[0];
066                                    cds.start = Integer.parseInt(row[3]);
067                                    cds.stop = Integer.parseInt(row[4]);
068                                    cds.strand = row[6].charAt(0);
069                                    
070                                    // Parse out the gene identifier
071                                    String attributes = row[8];
072                                    //log.warn(attributes);
073                                    for(String pair : attributes.split(";")) {
074                                            String[] keyValue = pair.trim().split(" ");
075                                            
076                                            // Strip quotes if they surround the ids
077                                            if(keyValue[1].charAt(0)=='"')
078                                                    keyValue[1] = keyValue[1].substring(1,keyValue[1].length()-1);
079    
080                                            // Check for something that indicates where this CDS belongs
081                                            if(keyValue[0].equals("gene_id") || keyValue[0].equals("Parent")) {
082                                                    cds.gene = keyValue[1];
083                                            }
084                                            //log.warn("Key="+keyValue[0]+" Value="+keyValue[1]);
085                                    }
086                                    Assert.a(cds.gene != null);
087                                    if (exonLists.containsKey(cds.seq)) {
088                                            exonLists.get(cds.seq).add(cds);
089                                            kept++;
090                                    } else {
091                                            threwout++;
092                                    }
093                                    
094                            }
095                    }
096                    if (threwout > 0) {
097                            log.warn("Threw out " + threwout + " of " + (threwout + kept) + " exons");
098                    }
099                    // Now go through and populate int vectors for each sequence 
100                    for(TrainingSequence<Map<String, Object>> seq : seqs) {
101                            String name = (String) seq.getInputSequence().getComponent("name").getX(0);
102                            List<CDS> exons = exonLists.get(name);
103                            
104                            // Sort in position order
105                            Collections.sort(exons);
106                            
107                            int[] states = new int[seq.length()];
108                            //log.warn("Seq: "+name+" Length: "+seq.length());
109                    
110                            
111                            mapExonsToStates(exons, states);
112                            seq.setY(states);
113    
114                            //confirmSeq(seq);
115                    }
116            }
117    
118            void mapExonsToStates(List<CDS> exons, int[] states) {
119                    if(exons.size() == 0)
120                            return;
121    
122                    // 1-based index of the last base of the previous exon (or 0-based index of the first base of gap)
123                    int currentPosition = Integer.MIN_VALUE;
124                    int exonState = Integer.MIN_VALUE;
125                    int intronState = Integer.MIN_VALUE;
126                    
127                    String currentGene = null;
128                    for(CDS exon : exons) {
129                            //log.warn(exon.gene+": "+exon.start+"-"+exon.stop);
130                            Assert.a(exon.start > currentPosition);
131                            boolean sameGene = exon.gene.equals(currentGene); 
132                            if(sameGene) {
133                                    // Gap was intron, fill in the previous state
134                                    // Intergenic is the default 0, and so we don't fill that in.
135                                    if(exon.strand == '+') {
136                                            intronState = (3-(currentPosition - (exonState-1))%3)%3+4;
137                                    }
138                                    else {
139                                            intronState = (currentPosition - (exonState-7))%3+10;
140                                    }
141                                    //log.warn(String.format("%d-%d State: %d", currentPosition, exon.start -1, intronState));
142                                    Arrays.fill(states, currentPosition, exon.start -1, intronState);
143                            }
144                            // Fill in the current exon
145                            if(sameGene) {
146                                    if(exon.strand == '+')
147                                            exonState = (exon.start-1+intronState-4)%3 + 1;
148                                    else
149                                            exonState = ((exon.start-1) - (intronState-10))%3+7;
150                            }
151                            else {
152                                    // New gene, only the current start matters
153                                    exonState = ((exon.start-1)%3) + 1 + (exon.strand == '-' ? 6:0);
154                            }
155                            //log.warn(String.format("%d-%d State: %d", exon.start-1, exon.stop-1, exonState));
156                            Arrays.fill(states, exon.start-1, exon.stop, exonState);
157                            
158                            currentGene = exon.gene;
159                            currentPosition = exon.stop;
160                    }
161            }
162            
163            public void writeTrainingSequences(Object location, Iterator<int[]> data) throws IOException {
164            }
165    
166            /**
167             * @return Returns the nameComponent.
168             */
169            public String getNameComponent() {
170                    return nameComponent;
171            }
172    
173            /**
174             * @param nameComponent The nameComponent to set.
175             */
176            public void setNameComponent(String nameComponent) {
177                    this.nameComponent = nameComponent;
178            }
179    
180            /* This is debugging code that lets you get a better idea if problems occur.  Specific to interval13 */
181            void confirmSeq(TrainingSequence<?> seq) {
182                    int[] states = seq.y;
183                    DirectFeatureList f = new DirectFeatureList();
184                    GeneConstraintsInterval13 g = new GeneConstraintsInterval13();
185                    InputSequenceCharacter a = (InputSequenceCharacter) seq.getInputSequence().getComponent("ref");
186                    for(int i = 1; i<seq.length(); ++i) {
187                            g.evaluateEdge(a, i, states[i-1], states[i], f);
188                            Assert.a(f.valid, String.format("Invalid at %d: %d-%d",i, states[i-1], states[i]));
189                    }
190            }
191            class DirectFeatureList implements FeatureList {
192                    FeatureEvaluation evals1;
193                    public int position;
194                    boolean valid = true;
195                    
196                    public DirectFeatureList() {
197                    }
198                    
199                    public void addFeature(int index, double val) {
200                            evals1.index[position] = (short) index;
201                            evals1.value[position++] = (float) val;
202                    }
203    
204                    /** Returns the invalid flag. */
205                    public boolean isValid() {
206                            return valid;
207                    }
208    
209                    /** Invalidates results. */
210                    public void invalidate() {
211                            valid = false;
212                    }
213            }
214    }