/* Accompanying code for Stark, Kheradpour, et al. (2007): * Reliable prediction of regulator targets using 12 Drosophila genomes * * Software not to be redistributed without permission from authors * Support contact email: alex.stark / pouyak / manoli / at mit.edu * * PURPOSE: * 1) calculates branch length score (BLS) and its confidence * 2) identifies control motifs * 3) counts the number of instances at each confidence level * * COMPILATION: * gcc -Wall -O3 -m32 -lm -o branch-sn branch-sn.c * * SAMPLE USAGE: * * REQUIRED FILES (. in formats indicates spacer column): * * with lines of format: * motif . . . . species_bitstring * where species_bitstring is, for e.g. 5 if both the first and * third informant species species match, but not the second * these represent each match to each motif and potential * control motif in the desired regions * * with one motif (no whitespace) per line (should not include control motifs) * * with lines of format: * species_bitstring . . branch_length * * is the number of species includingthe target species * thus, 2^(-1)-1 is the greatest species_bitstring * * First, compute (control motifs for each motif): * branch-sn -b -n -e -pc 1 | grep -w -f > * NOTE: control motifs have the same composition and are within +/-20% of the number of instances * NOTE: we do not permit one of the non-control motifs to be control motifs * NOTE: this only works if the motif in is a consensus sequence * * Next, use these and compute the branch length to confidence mapping: * branch-sn -f -b -n > * * has the format: * column 1: motif * columns 2-103: for branch lengths MAXBL*(0.00, 0.01, ..., 0.99, 1.00): * confidence;#instances;branch_length */ #define _GNU_SOURCE #include #include #include #include #include #include #define asserte1(b,a1) do {if (!(b)) {fprintf(stderr, (a1)); exit(1);}} while(0) #define asserte2(b,a1,a2) do {if (!(b)) {fprintf(stderr, (a1), (a2)); exit(1);}} while(0) #define MAXMOTLEN 50 #define ALPHSIZE 26 #define MAX(a,b) (((a)>(b)) ? (a) : (b)) #define MIN(a,b) (((a)<(b)) ? (a) : (b)) struct motif { char str[MAXMOTLEN+1]; unsigned char base_cnts[ALPHSIZE]; unsigned int *bin_cnts; unsigned int total_cnt; unsigned char c_ex; /* indicates excluded from being a control */ }; /* reads a branch length file where the first column is the species bistring and the * forth column is the total branch length */ float* read_bl_file(char* bl_file, int num_sp); /* compares motifs on the basis of base_cnts (i.e. motifs with the same composition will * have return value 0) */ int motifs_cmp (const void *a, const void *b) { return memcmp(((struct motif*)a)->base_cnts, ((struct motif*)b)->base_cnts, sizeof(unsigned char)*ALPHSIZE); } /* read in all the motifs along with their counts at each of the blcuts * final motifs table is sorted such that motifs with the same composition are together */ int read_motifs (char* fn, struct motif **motifs_ptr, int num_bl, int num_bins, unsigned int *spbit_to_bin); /* same as read_motifs except uses output from motif_match directly without use of * mm2branch-sn */ int read_motifs_mm (char* fn, struct motif **motifs_ptr, int num_bl, int num_bins, unsigned int *spbit_to_bin); double count_to_confid (unsigned int n, unsigned int nc, unsigned int n_r, unsigned int nc_r); void print_motif_line (char* mstr, unsigned int* m_bins, unsigned int* c_bins, float* consm_bl, int sp_mask, int num_bins); int main (int argc, char** argv) { int num_sp = 0; int num_motifs = 0; int num_bins = 101; struct motif* motifs; int num_bl; char* bl_file = NULL; char* c_file = NULL; char* ex_file = NULL; float* consm_bl = NULL; float tol_diff = 0.2; int sp_mask = -1; int print_controls = 0; int motif_match_input = 1; unsigned int *spbit_to_bin; /* same_end is actually one AFTER last with same composition */ int same_start = 0, same_end = 0; int i, j; for (i=1; i <-b file> [-t frac] [-n num] [-f file] [-e file] [-pc 0/1] [-mm 0/1] [-z float] \n", argv[0]); printf(" -n Number of species (0 <= #Bls < 2^(NS - 1)) [required]\n"); printf(" -f List of controls for each motif (overrides -e, -pc, -t)\n"); printf(" -e List of motifs to exclude from controls\n"); printf(" -k Species mask (-1 for all) [default: -1]\n"); printf(" -b Branch length file [required]\n"); printf(" -m Number of bins to use for output [default: 101]\n"); printf(" -t Tolerance permitted between motifs and matched controls [default: 0.1]\n"); printf(" -z Z-value to use to create confidence intervals for ratios when generating confidence number [default: 0]\n"); printf(" -mm Input is output directly from motif-match [default: 0]\n"); printf(" -pc Prints control motifs chosen for each motif [default: 0]\n"); return 1; } num_bl = 1 << (num_sp-1); if (sp_mask < 0) sp_mask = num_bl - 1; consm_bl = read_bl_file(bl_file,num_sp); /* conversion from the species bitstring to the corresponding bin */ spbit_to_bin = (unsigned int*) malloc(sizeof(unsigned int)*num_bl); for (i=0; idata]; unsigned int *c_bins = (unsigned int*) calloc(num_bins,sizeof(unsigned int)); e.key = strtok(NULL, "\t "); while (e.key != NULL) { /* if found, update c_bins */ ep = hsearch(e, FIND); if (ep != NULL) for (i=0; idata].bin_cnts[i]; e.key = strtok(NULL, "\t "); } /* print the line corresponding to this motif */ print_motif_line(m->str, m->bin_cnts, c_bins, consm_bl, sp_mask, num_bins); free(c_bins); } } if (line) free(line); } else { /* exclude motifs */ if (ex_file != NULL) { ENTRY e, *ep; FILE* fp = fopen (ex_file, "r"); char ex_motif[MAXMOTLEN+1]; e.key = ex_motif; while (fscanf(fp, "%s", e.key) != EOF) { ep = hsearch(e, FIND); if (ep != NULL) motifs[(int) ep->data].c_ex = 1; } fclose(fp); } /* scan through motifs and find matching instances */ for (i=0; i= same_end) { same_start = i; /* correct same_end */ for (same_end=i+1; same_end= motifs[j].total_cnt ) for (k=0; k= motifs[j].total_cnt ) printf("\t%s", motifs[j].str); printf("\n"); } } } /* free up used memory */ for (i=0; i0; i--) t[i-1] = t[i] + s[i-1]; return t; } void print_motif_line (char* mstr, unsigned int* m_bins, unsigned int* c_bins, float* consm_bl, int sp_mask, int num_bins) { int j; unsigned int* m_bins_c = cumul_array(m_bins, num_bins); unsigned int* c_bins_c = cumul_array(c_bins, num_bins); printf("%s", mstr); for (j=0; j 0 && m_bins_c[j] > 0) printf("%lf", count_to_confid(m_bins_c[0], m_bins_c[j], c_bins_c[0], c_bins_c[j])); else /* use -1 confidence to indicate unknown */ printf("-"); printf(";%d;%lf", m_bins_c[j], ((double)j*consm_bl[sp_mask])/(num_bins-1)); } printf("\n"); free(m_bins_c); free(c_bins_c); } double count_to_confid (unsigned int n, unsigned int nc, unsigned int n_r, unsigned int nc_r) { if (nc == 0 || n_r == 0) return 0.0; return 1.0 - ((double) nc_r * n) / n_r / nc; } int read_motifs (char* fn, struct motif **motifs_ptr, int num_bl, int num_bins, unsigned int *spbit_to_bin) { int num_motifs = 0; int size_reserved = 1 << 5; struct motif* motifs = (struct motif*) malloc(size_reserved*sizeof(struct motif)); FILE* fp = strcmp(fn, "-")==0 ? stdin : fopen(fn, "r"); asserte1(fp != NULL, "File could not be opened!\n"); while (fscanf(fp, "%s", motifs[num_motifs].str) != EOF) { struct motif* m; int i; num_motifs++; if ((size_reserved-1) < num_motifs) { size_reserved *= 2; motifs = (struct motif*) realloc(motifs,size_reserved*sizeof(struct motif)); } m = &motifs[num_motifs-1]; m->c_ex = 0; memset(m->base_cnts, 0, sizeof(unsigned char)*ALPHSIZE); m->bin_cnts = (unsigned int*) calloc(num_bins, sizeof(unsigned int)); m->total_cnt = 0; for (i=0; m->str[i]; i++) { /* note; with -f all this base composition stuff is ignored allowing for motifs * to have names that include non-alpha characters */ int c = toupper(m->str[i]) - 'A'; if (c >= 0 && c < 26) m->base_cnts[c]++; } for (i=0; ibin_cnts[spbit_to_bin[i]] += c; m->total_cnt += c; } } /* sort motifs so that motifs with the same composition are put together */ qsort (motifs, num_motifs, sizeof(struct motif), motifs_cmp); *motifs_ptr = motifs; fclose(fp); return num_motifs; } struct motif_idx { char str[MAXMOTLEN+1]; int idx; }; int motif_idx_cmp (const void *a, const void *b) { return strcmp(((struct motif_idx*)a)->str, ((struct motif_idx*)b)->str); } int read_motifs_mm (char* fn, struct motif **motifs_ptr, int num_bl, int num_bins, unsigned int *spbit_to_bin) { int num_motifs = 0; int size_reserved = 1 << 5; struct motif* motifs = (struct motif*) malloc(size_reserved*sizeof(struct motif)); struct motif_idx m, *mp, **mpp; void* motif_idx_tree = NULL; int bl; FILE* fp = strcmp(fn, "-")==0 ? stdin : fopen(fn, "r"); asserte1(fp != NULL, "File could not be opened!\n"); while (fscanf(fp, "%s %*s %*s %*s %*s %d%*[^\n]\n", m.str, &bl) != EOF) { /* lookup in tree */ mpp = tfind((void *) (&m), &motif_idx_tree, motif_idx_cmp); /* if not found, setup the motif */ if (mpp == NULL) { int i; /* add to array */ num_motifs++; if ((size_reserved-1) < num_motifs) { size_reserved *= 2; motifs = (struct motif*) realloc(motifs,size_reserved*sizeof(struct motif)); } strcpy(motifs[num_motifs-1].str, m.str); memset(motifs[num_motifs-1].base_cnts, 0, sizeof(unsigned char)*ALPHSIZE); motifs[num_motifs-1].c_ex = 0; motifs[num_motifs-1].bin_cnts = (unsigned int*) calloc(num_bins, sizeof(unsigned int)); motifs[num_motifs-1].total_cnt = 0; /* figure out base_cnts */ for (i=0; motifs[num_motifs-1].str[i]; i++) { /* note; with -f all this base composition stuff is ignored allowing for motifs * to have names that include non-alpha characters */ int c = toupper(motifs[num_motifs-1].str[i]) - 'A'; if (c >= 0 && c < 26) motifs[num_motifs-1].base_cnts[c]++; } /* add to tree */ mp = (struct motif_idx*) malloc(sizeof(struct motif_idx)); strcpy(mp->str, m.str); mp->idx = num_motifs-1; mpp = tsearch((void *) mp, &motif_idx_tree, motif_idx_cmp); } motifs[(*mpp)->idx].bin_cnts[spbit_to_bin[bl & (num_bl-1)]]++; motifs[(*mpp)->idx].total_cnt++; } /* NOTE: the tree is not deleted even though it could be now */ /* sort motifs so that motifs with the same composition are put together */ qsort (motifs, num_motifs, sizeof(struct motif), motifs_cmp); *motifs_ptr = motifs; fclose(fp); return num_motifs; } float* read_bl_file(char* bl_file, int num_sp) { int num_sp_comb = 1 << (num_sp-1); float* consm_bl = (float*) calloc(num_sp_comb, sizeof(float)); int index; float bl; FILE* fp = fopen(bl_file, "r"); asserte2(fp != NULL, "Cannot open: %s\n", bl_file); while (fscanf(fp, "%d %*s %*s %f\n", &index, &bl) != EOF) if (index < num_sp_comb) consm_bl[index]=bl; fclose(fp); return consm_bl; }