001    package calhoun.seq;
002    
003    import java.io.Serializable;
004    
005    import org.apache.commons.logging.Log;
006    import org.apache.commons.logging.LogFactory;
007    
008    import calhoun.util.Assert;
009    import calhoun.util.ErrorException;
010    
011    /** Class for computing kmer hashes.  You instantiate the class with the length of the kmer to hash and the alphabet to use.  
012     * The hashing functions then compute hashes for individual kmers.  This class does not stored any hashes, which are just ints. */
013    public class KmerHasher implements Serializable {
014            private static final long serialVersionUID = -3402947063680917230L;
015    
016            @SuppressWarnings("unused")
017            private static final Log log = LogFactory.getLog(KmerHasher.class);
018    
019            CharacterHash charHash;
020            int length;
021            int mscMult;    // Multiplier for most significant character
022            int lscMult;    // Multiplier for least significant character
023            
024            public interface CharacterHash extends Serializable {
025                    public short getSize();
026                    public int hash(char a) throws ErrorException;
027                    //public boolean hashable(char a);
028                    public char reverse(int a);
029            }
030            
031            /** Creates a hash from a given character hash and length.
032             */
033            public KmerHasher(CharacterHash charHash, int length) {
034                    this.charHash = charHash;
035                    this.length = length;
036                    mscMult = (int) Math.pow(charHash.getSize(), length-1);
037                    lscMult = charHash.getSize();
038                    // log.info(String.format("Alphabet size: %d Max hash: %d Max hash(len-1): %d, length: %d", lscMult, mscMult*lscMult-1, mscMult, length));
039            }
040    
041            
042            public static String reverseComplement(String forward) {
043                    String reverse = "";
044                    
045                    CharacterHash hf = ACGTother;
046                    CharacterHash hr = ACGTotherRC;
047                    
048                    for (int j=forward.length()-1; j>=0; j--) {
049                            reverse = reverse + hr.reverse(hf.hash(forward.charAt(j)));
050                    }
051                    
052                    return reverse;
053            }
054            
055            
056            public int range() {
057                    //return (int) mscMult+lscMult-1;//Math.pow(charHash.getSize(), length);
058                    return (int) mscMult*lscMult;
059            }
060            
061            /** Computes hash given a 0-based position on the string. */
062            public int hash(String str, int pos) {
063                    Assert.a(pos>=0);
064                    Assert.a(pos+length <= str.length());
065                    int hash = 0;
066                    for(int i = 0; i<length; ++i) {
067                            hash = hash*lscMult + charHash.hash(str.charAt(pos+i));
068                    }
069                    return hash;
070            }
071    
072            /** Computes a hash given a character array.  The array must be the exact size of the hash */
073            public int hash(char[] chr) {
074                    int hash = 0;
075                    for(int i = 0; i<length; ++i) {
076                            hash = hash*lscMult + charHash.hash(chr[i]);
077                    }
078                    return hash;
079            }
080    
081            /** Computes a hash given a character array.  Starts at the given index into the array */
082            public int hash(char[] chr, int start) {
083                    int hash = 0;
084                    for(int i = start; i<length+start; ++i) {
085                            hash = hash*lscMult + charHash.hash(chr[i]);
086                    }
087                    return hash;
088            }
089    
090            /** Updates an existing hash.  Drops the first character and adds in the new one to the end.
091             * hash("BCDE", 0) == shiftHash("E", hash("ABCD", 0))*/
092            public int shiftHash(char chr, int hash) {
093                    return ((hash%mscMult)*lscMult) + charHash.hash(chr);
094                    
095            }
096    
097            /** Updates an existing hash.  Drops the first character and adds in the new one to the end.
098             * hash("ABCD", 0) == reverseShiftHash("A", hash("BCDE", 0))*/
099            public int reverseShiftHash(char chr, int hash) {
100                    return charHash.hash(chr)*mscMult + hash/lscMult;
101            }
102    
103            /** Character hash function to use with DNA bases.  Upper and lower case get hashed to the same value.  Handles only the 4 nucleotides (ACTG).  No other characters allowed. */
104            public static CharacterHash DNA = new CharacterHash() {
105                    private static final long serialVersionUID = -5641887174464060367L;
106                    final char[] BASES = new char[] {'A','C','G','T'};
107                    public short getSize() { return 4;}
108                    public int hash(char a) {
109                            switch(a) {
110                                    case 'A':
111                                    case 'a':
112                                            return 0;
113                                    case 'C':
114                                    case 'c':
115                                            return 1;
116                                    case 'G':
117                                    case 'g':
118                                            return 2;
119                                    case 'T':
120                                    case 't':
121                                            return 3;
122                                    default:
123                                            throw new ErrorException("Bad character for hashing '"+a+"'.  Only A,C,T,G,a,c,t,g are allowed.");
124                            }
125                            
126                    }
127                    public char reverse(int a) {
128                            return BASES[a];
129                    }
130            };
131            
132            /** Character hash function to use with DNA bases which included the ambiguity code "N".  Upper and lower case get hashed to the same value.  Handles only the 4 nucleotides (ACTG).  No other characters allowed. */
133            public static CharacterHash ACGTN = new CharacterHash() {
134                    private static final long serialVersionUID = -4366495850512839766L;
135                    final char[] BASES = new char[] {'A','C','G','T','N'};
136                    public short getSize() { return 5;}
137                    public int hash(char a) {
138                            switch(a) {
139                                    case 'A':
140                                    case 'a':
141                                            return 0;
142                                    case 'C':
143                                    case 'c':
144                                            return 1;
145                                    case 'G':
146                                    case 'g':
147                                            return 2;
148                                    case 'T':
149                                    case 't':
150                                            return 3;
151                                    case 'N':
152                                    case 'n':
153                                            return 4;
154                                    default:
155                                            throw new ErrorException("Bad character for hashing '"+a+"'.  Only A,C,T,G,N,a,c,t,g,n are allowed.");
156                            }
157                    }
158                    public char reverse(int a) {
159                            return BASES[a];
160                    }
161            };
162            
163            /** Character hash function to use with DNA bases which included the ambiguity code "N".  Upper and lower case get hashed to the same value.  Handles only the 4 nucleotides (ACTG).  No other characters allowed. */
164            public static CharacterHash ACGTNcomp = new CharacterHash() {
165                    private static final long serialVersionUID = -5959780907260989652L;
166                    final char[] BASES = new char[] {'T','G','C','A','N'};
167                    public short getSize() { return 5;}
168                    public int hash(char a) {
169                            switch(a) {
170                                    case 'T':
171                                    case 't':
172                                            return 0;
173                                    case 'G':
174                                    case 'g':
175                                            return 1;
176                                    case 'C':
177                                    case 'c':
178                                            return 2;
179                                    case 'A':
180                                    case 'a':
181                                            return 3;
182                                    case 'N':
183                                    case 'n':
184                                            return 4;
185                                    default:
186                                            throw new ErrorException("Bad character for hashing '"+a+"'.  Only A,C,T,G,N,a,c,t,g,n are allowed.");
187                            }
188                    }
189                    public char reverse(int a) {
190                            return BASES[a];
191                    }
192            };
193            
194            
195            /** Character hash function to use with DNA bases which included the ambiguity code "N".  Upper and lower case get hashed to the same value.  Handles only the 4 nucleotides (ACTG).  No other characters allowed. */
196            public static CharacterHash ACGTother = new CharacterHash() {
197                    private static final long serialVersionUID = -3279138437137318988L;
198                    final char[] BASES = new char[] {'A','C','G','T','N'};
199                    public short getSize() { return 5;}
200                    public int hash(char a) {
201                            switch(a) {
202                                    case 'A':
203                                    case 'a':
204                                            return 0;
205                                    case 'C':
206                                    case 'c':
207                                            return 1;
208                                    case 'G':
209                                    case 'g':
210                                            return 2;
211                                    case 'T':
212                                    case 't':
213                                            return 3;
214                                    default:
215                                            return 4;
216                            }
217                    }
218                    public char reverse(int a) {
219                            return BASES[a];
220                    }
221            };
222            
223            
224            /** Character hash function to use with DNA bases which included the ambiguity code "N".  Upper and lower case get hashed to the same value.  Handles only the 4 nucleotides (ACTG).  No other characters allowed. */
225            public static CharacterHash ACGTotherRC = new CharacterHash() {
226                    private static final long serialVersionUID = 609468521914363959L;
227                    /* Like DNA, excapt never throws an exception; returns 4 if not ACGTN. */
228                    final char[] BASES = new char[] {'T','G','C','A','N'};
229                    public short getSize() { return 5;}
230                    public int hash(char a) {
231                            switch(a) {
232                                    case 'A':
233                                    case 'a':
234                                            return 3;
235                                    case 'C':
236                                    case 'c':
237                                            return 2;
238                                    case 'G':
239                                    case 'g':
240                                            return 1;
241                                    case 'T':
242                                    case 't':
243                                            return 0;
244                                    default:
245                                            return 4;
246                            }
247                    }
248                    public char reverse(int a) {
249                            return BASES[a];
250                    }
251            };
252            
253            
254            /** Character hash function to use with any letters.  Upper and lower case get hashed to the same value. */
255            public static CharacterHash LETTERS = new CharacterHash() { 
256                    private static final long serialVersionUID = -4036999685428397071L;
257                    public short getSize() { return 26;}
258                    public int hash(char a) {
259                            if(a >= 'a' && a <= 'z') {
260                                    return a - 'a';
261                            }
262                            else if(a >= 'A' && a <= 'Z') {
263                                    return a - 'A'; 
264                            }
265                            else {
266                                    throw new ErrorException("Bad character for hashing '"+a+"'.  Only a-z and A-Z are allowed.");
267                            }
268                    }
269                    public char reverse(int a) {
270                            return (char)('a'+(char)a);
271                    }
272            };
273    
274            public int hash(char c) {
275                    return charHash.hash(c);
276            }
277    
278            public boolean hashable(char a)  {
279                    try {
280                            charHash.hash(a);
281                    } catch (ErrorException E) {
282                            return false;
283                    }
284                    return true;
285            }
286            
287    }