001 package calhoun.analysis.crf.io;
002
003 import java.io.IOException;
004 import java.util.ArrayList;
005 import java.util.Arrays;
006 import java.util.Collections;
007 import java.util.HashMap;
008 import java.util.Iterator;
009 import java.util.List;
010 import java.util.Map;
011
012 import org.apache.commons.lang.builder.CompareToBuilder;
013 import org.apache.commons.lang.builder.EqualsBuilder;
014 import org.apache.commons.lang.builder.HashCodeBuilder;
015 import org.apache.commons.logging.Log;
016 import org.apache.commons.logging.LogFactory;
017
018 import calhoun.analysis.crf.FeatureList;
019 import calhoun.analysis.crf.features.interval13.GeneConstraintsInterval13;
020 import calhoun.analysis.crf.solver.CacheProcessor.FeatureEvaluation;
021 import calhoun.util.Assert;
022 import calhoun.util.FileUtil;
023
024 public class GTFInputInterval13 implements TrainingSequenceIO {
025 private static final long serialVersionUID = 4413724139445660883L;
026 private static final Log log = LogFactory.getLog(GTFInputInterval13.class);
027
028 String nameComponent = "name";
029
030 static class CDS implements Comparable<CDS> {
031 String seq;
032 String gene;
033 int start;
034 int stop;
035 char strand;
036
037 public int compareTo(CDS other) {
038 // Compare order is seq, gene, start. Using gene second ensures we don't have duplicated gene names or overlapping genes
039 return new CompareToBuilder().append(seq, other.seq).append(gene, other.gene).append(start, other.start).toComparison();
040 }
041
042 public boolean equals(CDS other) {
043 return new EqualsBuilder().append(seq, other.seq).append(gene, other.gene).append(start, other.start).isEquals();
044 }
045
046 public long hashcode() {
047 return new HashCodeBuilder().append(seq).append(gene).append(start).hashCode();
048 }
049 }
050
051 public void readTrainingSequences(Object location, List<TrainingSequence<Map<String, Object>>> seqs) throws IOException {
052 String[][] gtf = FileUtil.readFlatFile((String) location);
053
054 // Create a list of exons for each sequence
055 Map<String, List<CDS>> exonLists = new HashMap();
056 for(TrainingSequence<Map<String, Object>> seq : seqs) {
057 exonLists.put((String) seq.getInputSequence().getComponent("name").getX(0), new ArrayList());
058 }
059 int threwout = 0;
060 int kept = 0;
061 // First read in all of the exons into an array and sort by sequence and position
062 for(String[] row : gtf) {
063 if(row[2].equalsIgnoreCase("cds")) {
064 CDS cds = new CDS();
065 cds.seq = row[0];
066 cds.start = Integer.parseInt(row[3]);
067 cds.stop = Integer.parseInt(row[4]);
068 cds.strand = row[6].charAt(0);
069
070 // Parse out the gene identifier
071 String attributes = row[8];
072 //log.warn(attributes);
073 for(String pair : attributes.split(";")) {
074 String[] keyValue = pair.trim().split(" ");
075
076 // Strip quotes if they surround the ids
077 if(keyValue[1].charAt(0)=='"')
078 keyValue[1] = keyValue[1].substring(1,keyValue[1].length()-1);
079
080 // Check for something that indicates where this CDS belongs
081 if(keyValue[0].equals("gene_id") || keyValue[0].equals("Parent")) {
082 cds.gene = keyValue[1];
083 }
084 //log.warn("Key="+keyValue[0]+" Value="+keyValue[1]);
085 }
086 Assert.a(cds.gene != null);
087 if (exonLists.containsKey(cds.seq)) {
088 exonLists.get(cds.seq).add(cds);
089 kept++;
090 } else {
091 threwout++;
092 }
093
094 }
095 }
096 if (threwout > 0) {
097 log.warn("Threw out " + threwout + " of " + (threwout + kept) + " exons");
098 }
099 // Now go through and populate int vectors for each sequence
100 for(TrainingSequence<Map<String, Object>> seq : seqs) {
101 String name = (String) seq.getInputSequence().getComponent("name").getX(0);
102 List<CDS> exons = exonLists.get(name);
103
104 // Sort in position order
105 Collections.sort(exons);
106
107 int[] states = new int[seq.length()];
108 //log.warn("Seq: "+name+" Length: "+seq.length());
109
110
111 mapExonsToStates(exons, states);
112 seq.setY(states);
113
114 //confirmSeq(seq);
115 }
116 }
117
118 void mapExonsToStates(List<CDS> exons, int[] states) {
119 if(exons.size() == 0)
120 return;
121
122 // 1-based index of the last base of the previous exon (or 0-based index of the first base of gap)
123 int currentPosition = Integer.MIN_VALUE;
124 int exonState = Integer.MIN_VALUE;
125 int intronState = Integer.MIN_VALUE;
126
127 String currentGene = null;
128 for(CDS exon : exons) {
129 //log.warn(exon.gene+": "+exon.start+"-"+exon.stop);
130 Assert.a(exon.start > currentPosition);
131 boolean sameGene = exon.gene.equals(currentGene);
132 if(sameGene) {
133 // Gap was intron, fill in the previous state
134 // Intergenic is the default 0, and so we don't fill that in.
135 if(exon.strand == '+') {
136 intronState = (3-(currentPosition - (exonState-1))%3)%3+4;
137 }
138 else {
139 intronState = (currentPosition - (exonState-7))%3+10;
140 }
141 //log.warn(String.format("%d-%d State: %d", currentPosition, exon.start -1, intronState));
142 Arrays.fill(states, currentPosition, exon.start -1, intronState);
143 }
144 // Fill in the current exon
145 if(sameGene) {
146 if(exon.strand == '+')
147 exonState = (exon.start-1+intronState-4)%3 + 1;
148 else
149 exonState = ((exon.start-1) - (intronState-10))%3+7;
150 }
151 else {
152 // New gene, only the current start matters
153 exonState = ((exon.start-1)%3) + 1 + (exon.strand == '-' ? 6:0);
154 }
155 //log.warn(String.format("%d-%d State: %d", exon.start-1, exon.stop-1, exonState));
156 Arrays.fill(states, exon.start-1, exon.stop, exonState);
157
158 currentGene = exon.gene;
159 currentPosition = exon.stop;
160 }
161 }
162
163 public void writeTrainingSequences(Object location, Iterator<int[]> data) throws IOException {
164 }
165
166 /**
167 * @return Returns the nameComponent.
168 */
169 public String getNameComponent() {
170 return nameComponent;
171 }
172
173 /**
174 * @param nameComponent The nameComponent to set.
175 */
176 public void setNameComponent(String nameComponent) {
177 this.nameComponent = nameComponent;
178 }
179
180 /* This is debugging code that lets you get a better idea if problems occur. Specific to interval13 */
181 void confirmSeq(TrainingSequence<?> seq) {
182 int[] states = seq.y;
183 DirectFeatureList f = new DirectFeatureList();
184 GeneConstraintsInterval13 g = new GeneConstraintsInterval13();
185 InputSequenceCharacter a = (InputSequenceCharacter) seq.getInputSequence().getComponent("ref");
186 for(int i = 1; i<seq.length(); ++i) {
187 g.evaluateEdge(a, i, states[i-1], states[i], f);
188 Assert.a(f.valid, String.format("Invalid at %d: %d-%d",i, states[i-1], states[i]));
189 }
190 }
191 class DirectFeatureList implements FeatureList {
192 FeatureEvaluation evals1;
193 public int position;
194 boolean valid = true;
195
196 public DirectFeatureList() {
197 }
198
199 public void addFeature(int index, double val) {
200 evals1.index[position] = (short) index;
201 evals1.value[position++] = (float) val;
202 }
203
204 /** Returns the invalid flag. */
205 public boolean isValid() {
206 return valid;
207 }
208
209 /** Invalidates results. */
210 public void invalidate() {
211 valid = false;
212 }
213 }
214 }