001 package calhoun.seq;
002
003 import java.io.Serializable;
004
005 import org.apache.commons.logging.Log;
006 import org.apache.commons.logging.LogFactory;
007
008 import calhoun.util.Assert;
009 import calhoun.util.ErrorException;
010
011 /** Class for computing kmer hashes. You instantiate the class with the length of the kmer to hash and the alphabet to use.
012 * The hashing functions then compute hashes for individual kmers. This class does not stored any hashes, which are just ints. */
013 public class KmerHasher implements Serializable {
014 private static final long serialVersionUID = -3402947063680917230L;
015
016 @SuppressWarnings("unused")
017 private static final Log log = LogFactory.getLog(KmerHasher.class);
018
019 CharacterHash charHash;
020 int length;
021 int mscMult; // Multiplier for most significant character
022 int lscMult; // Multiplier for least significant character
023
024 public interface CharacterHash extends Serializable {
025 public short getSize();
026 public int hash(char a) throws ErrorException;
027 //public boolean hashable(char a);
028 public char reverse(int a);
029 }
030
031 /** Creates a hash from a given character hash and length.
032 */
033 public KmerHasher(CharacterHash charHash, int length) {
034 this.charHash = charHash;
035 this.length = length;
036 mscMult = (int) Math.pow(charHash.getSize(), length-1);
037 lscMult = charHash.getSize();
038 // log.info(String.format("Alphabet size: %d Max hash: %d Max hash(len-1): %d, length: %d", lscMult, mscMult*lscMult-1, mscMult, length));
039 }
040
041
042 public static String reverseComplement(String forward) {
043 String reverse = "";
044
045 CharacterHash hf = ACGTother;
046 CharacterHash hr = ACGTotherRC;
047
048 for (int j=forward.length()-1; j>=0; j--) {
049 reverse = reverse + hr.reverse(hf.hash(forward.charAt(j)));
050 }
051
052 return reverse;
053 }
054
055
056 public int range() {
057 //return (int) mscMult+lscMult-1;//Math.pow(charHash.getSize(), length);
058 return (int) mscMult*lscMult;
059 }
060
061 /** Computes hash given a 0-based position on the string. */
062 public int hash(String str, int pos) {
063 Assert.a(pos>=0);
064 Assert.a(pos+length <= str.length());
065 int hash = 0;
066 for(int i = 0; i<length; ++i) {
067 hash = hash*lscMult + charHash.hash(str.charAt(pos+i));
068 }
069 return hash;
070 }
071
072 /** Computes a hash given a character array. The array must be the exact size of the hash */
073 public int hash(char[] chr) {
074 int hash = 0;
075 for(int i = 0; i<length; ++i) {
076 hash = hash*lscMult + charHash.hash(chr[i]);
077 }
078 return hash;
079 }
080
081 /** Computes a hash given a character array. Starts at the given index into the array */
082 public int hash(char[] chr, int start) {
083 int hash = 0;
084 for(int i = start; i<length+start; ++i) {
085 hash = hash*lscMult + charHash.hash(chr[i]);
086 }
087 return hash;
088 }
089
090 /** Updates an existing hash. Drops the first character and adds in the new one to the end.
091 * hash("BCDE", 0) == shiftHash("E", hash("ABCD", 0))*/
092 public int shiftHash(char chr, int hash) {
093 return ((hash%mscMult)*lscMult) + charHash.hash(chr);
094
095 }
096
097 /** Updates an existing hash. Drops the first character and adds in the new one to the end.
098 * hash("ABCD", 0) == reverseShiftHash("A", hash("BCDE", 0))*/
099 public int reverseShiftHash(char chr, int hash) {
100 return charHash.hash(chr)*mscMult + hash/lscMult;
101 }
102
103 /** Character hash function to use with DNA bases. Upper and lower case get hashed to the same value. Handles only the 4 nucleotides (ACTG). No other characters allowed. */
104 public static CharacterHash DNA = new CharacterHash() {
105 private static final long serialVersionUID = -5641887174464060367L;
106 final char[] BASES = new char[] {'A','C','G','T'};
107 public short getSize() { return 4;}
108 public int hash(char a) {
109 switch(a) {
110 case 'A':
111 case 'a':
112 return 0;
113 case 'C':
114 case 'c':
115 return 1;
116 case 'G':
117 case 'g':
118 return 2;
119 case 'T':
120 case 't':
121 return 3;
122 default:
123 throw new ErrorException("Bad character for hashing '"+a+"'. Only A,C,T,G,a,c,t,g are allowed.");
124 }
125
126 }
127 public char reverse(int a) {
128 return BASES[a];
129 }
130 };
131
132 /** Character hash function to use with DNA bases which included the ambiguity code "N". Upper and lower case get hashed to the same value. Handles only the 4 nucleotides (ACTG). No other characters allowed. */
133 public static CharacterHash ACGTN = new CharacterHash() {
134 private static final long serialVersionUID = -4366495850512839766L;
135 final char[] BASES = new char[] {'A','C','G','T','N'};
136 public short getSize() { return 5;}
137 public int hash(char a) {
138 switch(a) {
139 case 'A':
140 case 'a':
141 return 0;
142 case 'C':
143 case 'c':
144 return 1;
145 case 'G':
146 case 'g':
147 return 2;
148 case 'T':
149 case 't':
150 return 3;
151 case 'N':
152 case 'n':
153 return 4;
154 default:
155 throw new ErrorException("Bad character for hashing '"+a+"'. Only A,C,T,G,N,a,c,t,g,n are allowed.");
156 }
157 }
158 public char reverse(int a) {
159 return BASES[a];
160 }
161 };
162
163 /** Character hash function to use with DNA bases which included the ambiguity code "N". Upper and lower case get hashed to the same value. Handles only the 4 nucleotides (ACTG). No other characters allowed. */
164 public static CharacterHash ACGTNcomp = new CharacterHash() {
165 private static final long serialVersionUID = -5959780907260989652L;
166 final char[] BASES = new char[] {'T','G','C','A','N'};
167 public short getSize() { return 5;}
168 public int hash(char a) {
169 switch(a) {
170 case 'T':
171 case 't':
172 return 0;
173 case 'G':
174 case 'g':
175 return 1;
176 case 'C':
177 case 'c':
178 return 2;
179 case 'A':
180 case 'a':
181 return 3;
182 case 'N':
183 case 'n':
184 return 4;
185 default:
186 throw new ErrorException("Bad character for hashing '"+a+"'. Only A,C,T,G,N,a,c,t,g,n are allowed.");
187 }
188 }
189 public char reverse(int a) {
190 return BASES[a];
191 }
192 };
193
194
195 /** Character hash function to use with DNA bases which included the ambiguity code "N". Upper and lower case get hashed to the same value. Handles only the 4 nucleotides (ACTG). No other characters allowed. */
196 public static CharacterHash ACGTother = new CharacterHash() {
197 private static final long serialVersionUID = -3279138437137318988L;
198 final char[] BASES = new char[] {'A','C','G','T','N'};
199 public short getSize() { return 5;}
200 public int hash(char a) {
201 switch(a) {
202 case 'A':
203 case 'a':
204 return 0;
205 case 'C':
206 case 'c':
207 return 1;
208 case 'G':
209 case 'g':
210 return 2;
211 case 'T':
212 case 't':
213 return 3;
214 default:
215 return 4;
216 }
217 }
218 public char reverse(int a) {
219 return BASES[a];
220 }
221 };
222
223
224 /** Character hash function to use with DNA bases which included the ambiguity code "N". Upper and lower case get hashed to the same value. Handles only the 4 nucleotides (ACTG). No other characters allowed. */
225 public static CharacterHash ACGTotherRC = new CharacterHash() {
226 private static final long serialVersionUID = 609468521914363959L;
227 /* Like DNA, excapt never throws an exception; returns 4 if not ACGTN. */
228 final char[] BASES = new char[] {'T','G','C','A','N'};
229 public short getSize() { return 5;}
230 public int hash(char a) {
231 switch(a) {
232 case 'A':
233 case 'a':
234 return 3;
235 case 'C':
236 case 'c':
237 return 2;
238 case 'G':
239 case 'g':
240 return 1;
241 case 'T':
242 case 't':
243 return 0;
244 default:
245 return 4;
246 }
247 }
248 public char reverse(int a) {
249 return BASES[a];
250 }
251 };
252
253
254 /** Character hash function to use with any letters. Upper and lower case get hashed to the same value. */
255 public static CharacterHash LETTERS = new CharacterHash() {
256 private static final long serialVersionUID = -4036999685428397071L;
257 public short getSize() { return 26;}
258 public int hash(char a) {
259 if(a >= 'a' && a <= 'z') {
260 return a - 'a';
261 }
262 else if(a >= 'A' && a <= 'Z') {
263 return a - 'A';
264 }
265 else {
266 throw new ErrorException("Bad character for hashing '"+a+"'. Only a-z and A-Z are allowed.");
267 }
268 }
269 public char reverse(int a) {
270 return (char)('a'+(char)a);
271 }
272 };
273
274 public int hash(char c) {
275 return charHash.hash(c);
276 }
277
278 public boolean hashable(char a) {
279 try {
280 charHash.hash(a);
281 } catch (ErrorException E) {
282 return false;
283 }
284 return true;
285 }
286
287 }