001    package calhoun.analysis.crf.io;
002    
003    import java.util.ArrayList;
004    import java.util.Collection;
005    import java.util.List;
006    
007    import org.apache.commons.lang.StringUtils;
008    import org.apache.commons.logging.Log;
009    import org.apache.commons.logging.LogFactory;
010    
011    import calhoun.analysis.crf.features.supporting.phylogenetic.PhylogeneticTreeFelsensteinOrder;
012    import calhoun.analysis.crf.features.supporting.phylogenetic.RootedBinaryPhylogeneticTree;
013    import calhoun.seq.KmerHasher;
014    import calhoun.util.Assert;
015    
016    /** an input sequence where each element represents one column of a multiple alignment.
017     */
018    public class MultipleAlignmentInputSequence implements InputSequence<MultipleAlignmentInputSequence.MultipleAlignmentColumn> {
019            private static final Log log = LogFactory.getLog(MultipleAlignmentInputSequence.class);
020    
021            // Raw original data
022            List<String> speciesNames;
023            List<String> consensuses;
024            RootedBinaryPhylogeneticTree tree;
025            String refSpecies;
026            
027            // Derived data
028            int nSpecies;
029            int consensusLength;
030            KmerHasher h;
031            ArrayList<Integer> ref2con;
032            int[] con2refLeft;
033            int[] con2refRight;
034            
035            int refSpeciesIndex=0;
036            
037            int reflen;
038            boolean ready;  
039    
040            KmerHasher columnHasher;
041    
042            public MultipleAlignmentInputSequence(String refSpecies, RootedBinaryPhylogeneticTree tree) {
043                    this.refSpecies = refSpecies;
044                    this.tree = tree;
045                    nSpecies = tree.getNumSpecies();
046            }
047    
048            /** constructs a multiple alignment input sequence.
049             * @param speciesNames a list of the names of the species in the alignments
050             * @param consensuses a list of the consensus sequences for each species.  The consensus sequences shoudl form a multiple alignment, including gaps.
051             * @param tree a tree of all the species in the alignment with branch lengths.
052             */
053            public MultipleAlignmentInputSequence(List<String> speciesNames, List<String> consensuses, String refSpecies, RootedBinaryPhylogeneticTree tree) {
054                    this(refSpecies, tree);
055                    setSpeciesAndConsensuses(speciesNames, consensuses);
056            }
057    
058            public int getNumSpecies() {
059                    return nSpecies;
060            }
061            
062            void setSpeciesAndConsensuses(List<String> speciesNames, List<String> consensuses) {
063                    this.speciesNames = speciesNames;
064                    this.consensuses = consensuses;
065    
066                    // Determine number of species in multiple alignment;
067                    Assert.a(speciesNames.size() == consensuses.size());
068                    Assert.a(nSpecies == speciesNames.size());
069                    Assert.a(nSpecies >= 1,"Number of species was " + nSpecies + " and supposed to be >= 1");
070    
071                    refSpeciesIndex = speciesNames.indexOf(refSpecies);
072                    Assert.a(refSpeciesIndex != -1, "Reference species ",refSpecies," not found in ",StringUtils.join(speciesNames.iterator(), ','));
073                    
074                    columnHasher = new KmerHasher(KmerHasher.ACGTother, nSpecies);
075                     
076                    // Determine length of consensus (padded) sequence in multiple alignment
077                    consensusLength = consensuses.get(refSpeciesIndex).length();
078                    for (int spec = 0; spec<nSpecies; spec++) {
079                            Assert.a(consensuses.get(spec).length() == consensusLength);
080                    }
081                    
082                    h = new KmerHasher(KmerHasher.ACGTN,1);
083                    reflen = 0;
084                    ref2con = new ArrayList<Integer>();
085                    String refCon = consensuses.get(refSpeciesIndex);
086                    for (int cpos=0; cpos<consensusLength; cpos++) {
087                            if (h.hashable(refCon.charAt(cpos))) {
088                                    ref2con.add(cpos);
089                                    reflen++;
090                            }
091                    }
092                    Assert.a(ref2con.size() == reflen);
093    
094                    
095                    con2refLeft = new int[consensusLength]; // at cpos, = max(0 , argmax_rpos( ref2con(rpos)< cpos ) ) 
096                    int refLeft = 0;
097                    for (int cpos=0; cpos<consensusLength; cpos++) {
098                            if (refLeft < (reflen-1)) {
099                                    if (ref2con.get(refLeft+1) < cpos)  { refLeft++; }
100                            }
101                            con2refLeft[cpos] = refLeft;
102                    }
103                    con2refRight = new int[consensusLength]; // at cpos = min(reflen-1, argmin_rpos( ref2con(rpos)>cpos) )
104                    int refRight = reflen-1;
105                    for (int cpos=consensusLength-1; cpos>=0; cpos--) {
106                            if (refRight>0) {
107                                    if (ref2con.get(refRight-1) > cpos) { refRight--; }
108                            }
109                            con2refRight[cpos] = refRight;
110                    }
111                    
112                    log.debug("consensus length = " + consensusLength + "    Reference length = " + reflen);
113            }
114            
115            public MultipleAlignmentColumn getX(int ix) {
116                    MultipleAlignmentColumn ret = new MultipleAlignmentColumn(ix);
117                    return ret;
118            }
119    
120            public int length() {
121                    return reflen;
122            }
123    
124            public InputSequence<?> getComponent(String name) {
125                    throw new UnsupportedOperationException();
126            }
127    
128            public Collection<String> listComponents() {
129                    throw new UnsupportedOperationException();
130            }
131    
132            public int con2refLeft( int cpos ) {
133                    return con2refLeft[cpos];
134            }
135    
136            public int con2refRight( int cpos ) {
137                    return con2refRight[cpos];
138            }
139    
140            public int ref2con(int pos) {
141                    return ref2con.get(pos);
142            }
143    
144            public char characterInPaddedAlignment(int consensusPosition, int speciesNumber) {
145                    return consensuses.get(speciesNumber).charAt(consensusPosition);
146            }
147    
148            public int numSpecies() {
149                    return nSpecies;
150            }
151    
152            public InputSequence<MultipleAlignmentColumn> subSequence(int start, int end) {
153                    Assert.a(start >= 1);
154                    Assert.a(end <= this.length());
155                    Assert.a(start <= end);
156                    
157                    int constart1 = ref2con(start-1)+1; // start coord in consensus, one-based inclusive
158                    int conend1 = ref2con(end-1)+1; // end coord in consensus, one-based inclusive
159                    
160                    ArrayList<String> newcon = new ArrayList<String>();
161                    
162                    for (int j=0; j<nSpecies; j++) {
163                            newcon.add(consensuses.get(j).substring(constart1-1,conend1));
164                    }
165                    
166                    MultipleAlignmentInputSequence MA = new MultipleAlignmentInputSequence(speciesNames,newcon,refSpecies, tree);
167                    
168                    return MA;
169            }
170    
171            public List<String> getSpeciesNames()
172            {
173                    return speciesNames;
174            }
175            
176            public List<String> getConsensusSeqs()
177            {
178                    return consensuses;
179            }
180    
181            public int getColumnUniqueHash(int conpos) {
182                    int ret = 0;
183                    for(int i = 0; i<nSpecies; ++i) {
184                            ret = columnHasher.shiftHash(consensuses.get(i).charAt(conpos), ret);
185                    }
186                    return ret;
187            }
188    
189            public int getConsensusLength() {
190                    return consensusLength;
191            }
192    
193            public RootedBinaryPhylogeneticTree getTree() {
194                    return tree;
195            }
196            
197            public PhylogeneticTreeFelsensteinOrder getFelsensteinOrder() {
198                    int n = speciesNames.size();
199                    String[] sn = new String[n];
200                    for (int j=0; j<n; j++) {
201                            sn[j] = speciesNames.get(j);
202                    }
203                    return tree.subtree(sn).getFelsensteinOrder(sn);
204            }
205    
206            /** represents the column of the alignment at a given position on the reference sequence */
207            public class MultipleAlignmentColumn {
208                    private int pos;
209                    private int cpos;
210    
211                    /** constructs the column at this position */
212                    public MultipleAlignmentColumn(int pos) {
213                            this.pos = pos;
214                            cpos = ref2con(pos);
215                    }
216    
217                    /** returns the multiple alignment input sequence that this column comes from.
218                     * @return the owning input sequence
219                     */
220                    public MultipleAlignmentInputSequence getMultipleAlignment() {
221                            return MultipleAlignmentInputSequence.this;
222                    }
223                    
224                    /** returns the number of species in the alignment
225                     * @return the number of species in the alignment
226                     */
227                    public int numSpecies() {
228                            return MultipleAlignmentInputSequence.this.numSpecies();
229                    }
230    
231                    /** returns the value of this position in the alignment for the given species
232                     * @param spec the species to retrieve.  The value will be the index of the species in the <code>speciesName</code> list.
233                     * @return the character for this species in this column
234                     */
235                    public char nucleotide(int spec) {
236                            return MultipleAlignmentInputSequence.this.characterInPaddedAlignment(cpos,spec);
237                    }
238    
239                    /** returns a hash value for the characters in this column
240                     * @return the hash value for the column
241                     */
242                    public int getUniqueHash() {
243                            return MultipleAlignmentInputSequence.this.getColumnUniqueHash(pos);
244                    }
245    
246                    /** returns the tree for this alignment
247                     * @return the species tree for the alignment
248                     */
249                    public RootedBinaryPhylogeneticTree getTree() {
250                            return MultipleAlignmentInputSequence.this.getTree();
251                    }
252            }
253    }