001 package calhoun.analysis.crf.io;
002
003 import java.util.ArrayList;
004 import java.util.Collection;
005 import java.util.List;
006
007 import org.apache.commons.lang.StringUtils;
008 import org.apache.commons.logging.Log;
009 import org.apache.commons.logging.LogFactory;
010
011 import calhoun.analysis.crf.features.supporting.phylogenetic.PhylogeneticTreeFelsensteinOrder;
012 import calhoun.analysis.crf.features.supporting.phylogenetic.RootedBinaryPhylogeneticTree;
013 import calhoun.seq.KmerHasher;
014 import calhoun.util.Assert;
015
016 /** an input sequence where each element represents one column of a multiple alignment.
017 */
018 public class MultipleAlignmentInputSequence implements InputSequence<MultipleAlignmentInputSequence.MultipleAlignmentColumn> {
019 private static final Log log = LogFactory.getLog(MultipleAlignmentInputSequence.class);
020
021 // Raw original data
022 List<String> speciesNames;
023 List<String> consensuses;
024 RootedBinaryPhylogeneticTree tree;
025 String refSpecies;
026
027 // Derived data
028 int nSpecies;
029 int consensusLength;
030 KmerHasher h;
031 ArrayList<Integer> ref2con;
032 int[] con2refLeft;
033 int[] con2refRight;
034
035 int refSpeciesIndex=0;
036
037 int reflen;
038 boolean ready;
039
040 KmerHasher columnHasher;
041
042 public MultipleAlignmentInputSequence(String refSpecies, RootedBinaryPhylogeneticTree tree) {
043 this.refSpecies = refSpecies;
044 this.tree = tree;
045 nSpecies = tree.getNumSpecies();
046 }
047
048 /** constructs a multiple alignment input sequence.
049 * @param speciesNames a list of the names of the species in the alignments
050 * @param consensuses a list of the consensus sequences for each species. The consensus sequences shoudl form a multiple alignment, including gaps.
051 * @param tree a tree of all the species in the alignment with branch lengths.
052 */
053 public MultipleAlignmentInputSequence(List<String> speciesNames, List<String> consensuses, String refSpecies, RootedBinaryPhylogeneticTree tree) {
054 this(refSpecies, tree);
055 setSpeciesAndConsensuses(speciesNames, consensuses);
056 }
057
058 public int getNumSpecies() {
059 return nSpecies;
060 }
061
062 void setSpeciesAndConsensuses(List<String> speciesNames, List<String> consensuses) {
063 this.speciesNames = speciesNames;
064 this.consensuses = consensuses;
065
066 // Determine number of species in multiple alignment;
067 Assert.a(speciesNames.size() == consensuses.size());
068 Assert.a(nSpecies == speciesNames.size());
069 Assert.a(nSpecies >= 1,"Number of species was " + nSpecies + " and supposed to be >= 1");
070
071 refSpeciesIndex = speciesNames.indexOf(refSpecies);
072 Assert.a(refSpeciesIndex != -1, "Reference species ",refSpecies," not found in ",StringUtils.join(speciesNames.iterator(), ','));
073
074 columnHasher = new KmerHasher(KmerHasher.ACGTother, nSpecies);
075
076 // Determine length of consensus (padded) sequence in multiple alignment
077 consensusLength = consensuses.get(refSpeciesIndex).length();
078 for (int spec = 0; spec<nSpecies; spec++) {
079 Assert.a(consensuses.get(spec).length() == consensusLength);
080 }
081
082 h = new KmerHasher(KmerHasher.ACGTN,1);
083 reflen = 0;
084 ref2con = new ArrayList<Integer>();
085 String refCon = consensuses.get(refSpeciesIndex);
086 for (int cpos=0; cpos<consensusLength; cpos++) {
087 if (h.hashable(refCon.charAt(cpos))) {
088 ref2con.add(cpos);
089 reflen++;
090 }
091 }
092 Assert.a(ref2con.size() == reflen);
093
094
095 con2refLeft = new int[consensusLength]; // at cpos, = max(0 , argmax_rpos( ref2con(rpos)< cpos ) )
096 int refLeft = 0;
097 for (int cpos=0; cpos<consensusLength; cpos++) {
098 if (refLeft < (reflen-1)) {
099 if (ref2con.get(refLeft+1) < cpos) { refLeft++; }
100 }
101 con2refLeft[cpos] = refLeft;
102 }
103 con2refRight = new int[consensusLength]; // at cpos = min(reflen-1, argmin_rpos( ref2con(rpos)>cpos) )
104 int refRight = reflen-1;
105 for (int cpos=consensusLength-1; cpos>=0; cpos--) {
106 if (refRight>0) {
107 if (ref2con.get(refRight-1) > cpos) { refRight--; }
108 }
109 con2refRight[cpos] = refRight;
110 }
111
112 log.debug("consensus length = " + consensusLength + " Reference length = " + reflen);
113 }
114
115 public MultipleAlignmentColumn getX(int ix) {
116 MultipleAlignmentColumn ret = new MultipleAlignmentColumn(ix);
117 return ret;
118 }
119
120 public int length() {
121 return reflen;
122 }
123
124 public InputSequence<?> getComponent(String name) {
125 throw new UnsupportedOperationException();
126 }
127
128 public Collection<String> listComponents() {
129 throw new UnsupportedOperationException();
130 }
131
132 public int con2refLeft( int cpos ) {
133 return con2refLeft[cpos];
134 }
135
136 public int con2refRight( int cpos ) {
137 return con2refRight[cpos];
138 }
139
140 public int ref2con(int pos) {
141 return ref2con.get(pos);
142 }
143
144 public char characterInPaddedAlignment(int consensusPosition, int speciesNumber) {
145 return consensuses.get(speciesNumber).charAt(consensusPosition);
146 }
147
148 public int numSpecies() {
149 return nSpecies;
150 }
151
152 public InputSequence<MultipleAlignmentColumn> subSequence(int start, int end) {
153 Assert.a(start >= 1);
154 Assert.a(end <= this.length());
155 Assert.a(start <= end);
156
157 int constart1 = ref2con(start-1)+1; // start coord in consensus, one-based inclusive
158 int conend1 = ref2con(end-1)+1; // end coord in consensus, one-based inclusive
159
160 ArrayList<String> newcon = new ArrayList<String>();
161
162 for (int j=0; j<nSpecies; j++) {
163 newcon.add(consensuses.get(j).substring(constart1-1,conend1));
164 }
165
166 MultipleAlignmentInputSequence MA = new MultipleAlignmentInputSequence(speciesNames,newcon,refSpecies, tree);
167
168 return MA;
169 }
170
171 public List<String> getSpeciesNames()
172 {
173 return speciesNames;
174 }
175
176 public List<String> getConsensusSeqs()
177 {
178 return consensuses;
179 }
180
181 public int getColumnUniqueHash(int conpos) {
182 int ret = 0;
183 for(int i = 0; i<nSpecies; ++i) {
184 ret = columnHasher.shiftHash(consensuses.get(i).charAt(conpos), ret);
185 }
186 return ret;
187 }
188
189 public int getConsensusLength() {
190 return consensusLength;
191 }
192
193 public RootedBinaryPhylogeneticTree getTree() {
194 return tree;
195 }
196
197 public PhylogeneticTreeFelsensteinOrder getFelsensteinOrder() {
198 int n = speciesNames.size();
199 String[] sn = new String[n];
200 for (int j=0; j<n; j++) {
201 sn[j] = speciesNames.get(j);
202 }
203 return tree.subtree(sn).getFelsensteinOrder(sn);
204 }
205
206 /** represents the column of the alignment at a given position on the reference sequence */
207 public class MultipleAlignmentColumn {
208 private int pos;
209 private int cpos;
210
211 /** constructs the column at this position */
212 public MultipleAlignmentColumn(int pos) {
213 this.pos = pos;
214 cpos = ref2con(pos);
215 }
216
217 /** returns the multiple alignment input sequence that this column comes from.
218 * @return the owning input sequence
219 */
220 public MultipleAlignmentInputSequence getMultipleAlignment() {
221 return MultipleAlignmentInputSequence.this;
222 }
223
224 /** returns the number of species in the alignment
225 * @return the number of species in the alignment
226 */
227 public int numSpecies() {
228 return MultipleAlignmentInputSequence.this.numSpecies();
229 }
230
231 /** returns the value of this position in the alignment for the given species
232 * @param spec the species to retrieve. The value will be the index of the species in the <code>speciesName</code> list.
233 * @return the character for this species in this column
234 */
235 public char nucleotide(int spec) {
236 return MultipleAlignmentInputSequence.this.characterInPaddedAlignment(cpos,spec);
237 }
238
239 /** returns a hash value for the characters in this column
240 * @return the hash value for the column
241 */
242 public int getUniqueHash() {
243 return MultipleAlignmentInputSequence.this.getColumnUniqueHash(pos);
244 }
245
246 /** returns the tree for this alignment
247 * @return the species tree for the alignment
248 */
249 public RootedBinaryPhylogeneticTree getTree() {
250 return MultipleAlignmentInputSequence.this.getTree();
251 }
252 }
253 }