#define min(x,y) (x #include #include #include #define MAXNBSEQS 20000 #define BIG_INT 10000000; char** allocate_more_kmers(char** kmers, int n, int max_nb_kmers, int inc_nb_kmers); int sequenceOverlap(char* s1, char* s2); int findSites(char* r, char* name, char* seq, int* positions, int* np, int* orientations, int* no, int singlestrand); double lcumhyper(int i, int s1, int s2, int N); double cumhyper(int i, int s1, int s2, int N); double bico(int n, int k); double factln(int n); double gammln(double xx); double factrl(int n) ; void nrerror(char str[]); double hypergeom(int i, int s1, int s2, int N); void chomp(char* s); char* complement(char* s); char* getComplement(char* s) ; char* get_parameter(int argc, char** argv, char* param); char** readFastaFile(char* filename, char** names_seq, int* n); void get_overlap_array(char* site1, char* site2, int size, int* i, int* s1, int* s2); double getUpdatedScore(char* site1k1, char* site2k1, char* site1k2, char* site2k2, int size, int* overlap_i, int* overlap_s1, int* overlap_s2, int op); int CmpFunc(const void* _a, const void* _b); int CmpFuncSimpleInt(const void* _a, const void* _b); typedef struct _KmerPair { int index1; // where is it found int index2; int s1; int s2; int ov; float score; } KmerPair; int verbose = 0; int main (int argc, char** argv) { // sequences char* fastafile1; char* fastafile2; char** seqs1; char** seqs2; int nbgenes; int ns1; int ns2; char** names_seq; char* outfile; // kmers char* kmersfile; char* kmer1; char* kmer2; int kmer_size = 0; char** kmers; int motifsize; int n; double* scores; int s1; int s2; int sc; // occurrences char** sites_species1; char** sites_species2; int* positions; int np; char* name = 0; char* new_sites_species1; char* new_sites_species2; char* available_kmers; int* a_s1; int* a_s2; int* a_i; int limit = 0; int i_limit = 0; double newscore; int overlap_i, overlap_s1, overlap_s2; // general char* buff; FILE* fp; int i, j, k; int a, b; // comparison of kmers int strand; int decal; int ov; int ovmin; int surface; int nb_pairs = 0; int twokmers = 0; KmerPair* pairs_array; int maxov; // positions int*** positions_species1; int*** positions_species2; int nb_distances; int* distances; int* genes_for_distances; int p1, p2; int dist, mindist; float med; char* distfile; int use_min_dist = 0; // by default, do not use min dist // output char* dir; char* outdir; int singlestrand = 0; int* orientations; int no; int ov1; int ov2; int max_nb_kmers = 10000; int inc_nb_kmers = 1000; int ss1, ss2, ss3, ss4; if (argc < 3) { printf("Usage: do_fastcompare_coconservation -fasta1 FILE -fasta2 FILE -kmers FILE -maxov INT -limit INT -singlestrand INT\n"); exit(0); } // // INPUTS // fastafile1 = get_parameter(argc, argv, "-fasta1"); fastafile2 = get_parameter(argc, argv, "-fasta2"); kmersfile = get_parameter(argc, argv, "-kmers"); limit = atoi(get_parameter(argc, argv, "-limit")); maxov = atoi(get_parameter(argc, argv, "-maxov")); if (strcmp(get_parameter(argc, argv, "-singlestrand"), "") != 0) singlestrand = atoi(get_parameter(argc, argv, "-singlestrand")); else singlestrand = 1; // // READ AND STORE THE SEQUENCES // names_seq = (char**)malloc(MAXNBSEQS * sizeof(char*)); seqs1 = readFastaFile(fastafile1, names_seq, &ns1); seqs2 = readFastaFile(fastafile2, names_seq, &ns2); // // READ THE LIST OF KMERS // kmers = (char**)malloc(max_nb_kmers * sizeof(char*)); // create a line buffer buff = (char*)malloc(10000 * sizeof(char)); fp = fopen(kmersfile, "r"); if (!fp) { printf("could not kmer file %s\n", kmersfile); } n = 0; motifsize = 0; while (!feof(fp)) { fscanf(fp, "%s\t%f\t%f\t%f\t%f\n", buff, &sc, &sc, &sc, &sc); kmers[n] = (char*)calloc(100, sizeof(char)); strcat(kmers[n], buff); n++; // allocate more memory if we need it if (n == max_nb_kmers) { kmers = allocate_more_kmers(kmers, n, max_nb_kmers, inc_nb_kmers); max_nb_kmers += inc_nb_kmers; } if ((limit > 0) && (limit == n)) { break; } } fclose(fp); // // for each pattern, fill a vector of occurrences // positions = (int*)malloc(5000 * sizeof(char)); orientations = (int*)malloc(5000 * sizeof(int)); sites_species1 = (char**)malloc(n * sizeof(char*)); sites_species2 = (char**)malloc(n * sizeof(char*)); scores = (double*)malloc(n * sizeof(double)); a_s1 = (int*)malloc(n * sizeof(int)); a_s2 = (int*)malloc(n * sizeof(int)); a_i = (int*)malloc(n * sizeof(int)); for (i=0; i 0) { sites_species1[i][j] = 1; } np = 0; no = 0; findSites(kmers[i], name, seqs2[j], positions, &np, orientations, &no, singlestrand); if (np > 0) { sites_species2[i][j] = 1; } } // calculate a score get_overlap_array(sites_species1[i], sites_species2[i], ns1, &overlap_i, &overlap_s1, &overlap_s2); a_s1[i] = overlap_s1; a_s2[i] = overlap_s2; a_i [i] = overlap_i; // calculate (and store) conservation score for this kmer scores[i] = - lcumhyper(overlap_i, overlap_s1, overlap_s2, ns1); } // // allocate memory for a whole list of pairs of kmers // pairs_array = (KmerPair*)malloc(n * n * sizeof(KmerPair)); nb_pairs = 0; for (i=0; i 0 ) continue; if (maxov && ((ov1 > maxov) || (ov2 > maxov))) continue; newscore = getUpdatedScore(sites_species1[i], sites_species2[i], sites_species1[j], sites_species2[j], ns1, &overlap_i, &overlap_s1, &overlap_s2, 0); pairs_array[nb_pairs].index1 = i; pairs_array[nb_pairs].index2 = j; pairs_array[nb_pairs].ov = overlap_i; pairs_array[nb_pairs].s1 = overlap_s1; pairs_array[nb_pairs].s2 = overlap_s2; pairs_array[nb_pairs].score = newscore; nb_pairs ++; } } qsort((void*)pairs_array, nb_pairs, sizeof(KmerPair), CmpFunc); outdir = (char*)calloc(200, sizeof(char)); for (i=0; i *b) return 1; else if(*a == *b) return 0; else return -1; } int CmpFunc(const void* _a, const void* _b) { const KmerPair* a = (const KmerPair*) _a; const KmerPair* b = (const KmerPair*) _b; if (a->score > b->score) return 1; else if(a->score == b->score) return 0; else return -1; } /** REGEXP, returns 1 if a site was found, 0 otherwise **/ int findSites(char* r, char* name, char* seq, int* positions, int* np, int* orientations, int* no, int singlestrand) { int i, j; //, j, n; int score1, score2; pcre* re; const char* error; int erroffset; int rc; int ovector[30]; char* cr = 0; char* substring_start; int substring_length; int startoffset = 0; int pos; char* seq_pos; // // store the position where a RE has been found, to avoid counting twice the palindromes // seq_pos = (char*)calloc(strlen(seq), sizeof(char)); *np = 0; *no = 0; //printf("searching for %s in %s\n", r, seq); // // ONE STRAND // re = pcre_compile(r, 0, &error, &erroffset, NULL); while ( (rc = pcre_exec(re, NULL, seq, strlen(seq), startoffset, 0, ovector, 30) ) > 0) { substring_start = seq + ovector[0]; substring_length = ovector[1] - ovector[0]; // save the position pos = strlen(seq) - ovector[0] - 1; positions[ *np ] = pos; // save the orientation orientations[ *no ] = 1; startoffset = ovector[0] + substring_length; seq_pos[ pos ] = 1; (*np)++; (*no)++; } //return 0; if (singlestrand == 0) { // OTHER STRAND cr = complement(r); //printf("Complement of %s is %s\n", r, cr); startoffset = 0; re = pcre_compile(cr, 0, &error, &erroffset, NULL); while ( (rc = pcre_exec(re, NULL, seq, strlen(seq), startoffset, 0, ovector, 30)) > 0) { substring_start = seq + ovector[0]; substring_length = ovector[1] - ovector[0]; pos = strlen(seq) - ovector[0] - 1; // if not a palindromic version if (seq_pos[ pos ] != 1) { // save the position positions[ *np ] = pos; (*np)++; } // save the orientation in all cases orientations[ *no ] = -1; (*no)++; startoffset = ovector[0] + substring_length; } } if (cr != 0) free(cr); free(seq_pos); pcre_free(re); return 0; } // // take a char* and replace every nt by its complement // char* complement(char* s) { int l = strlen(s); char* c; char d; int i; c = (char*)calloc(l+1, sizeof(char)); for (i=l-1; i>=0; i--) { if (s[i] == 'A') d = 'T'; else if (s[i] == 'T') d = 'A'; else if (s[i] == 'G') d = 'C'; else if (s[i] == 'C') d = 'G'; else if (s[i] == '[') d = ']'; else if (s[i] == ']') d = '['; else d = s[i]; strncat(c, &d, 1); } return c; } void chomp(char* s) { int j; j = strlen(s); while (j >= 0) { if (s[j] == '\n') { s[j] = '\0'; return; } j--; } } char* get_parameter(int argc, char** argv, char* param) { int i = 0; while ((i < argc) && (strcmp(param, argv[i]))) i++; if (i') { names_seq[i] = strdup(buff + 1); } else if ((buff[0] != '>') && (strlen(buff) > 0)) { seqs[i] = strdup(buff); //printf("Read %d sequences (l=%d)..\n", i, strlen(seqs[i])); i++; } } *n = i; return seqs; } // // // NNNNNNNNNNNNNNNNNNNNNNNNNNN // <--- decal ---> NNNNNNNNNNNNNNNNN // // int getBestOverlap(char* s1, char* s2, int* strand, int* decal, int ma, int mm, int twostrand) { int l1 = strlen(s1); int l2 = strlen(s2); int** a; //[l1][l2]; int best_a; int i, j; char* s2c; a = (int**)malloc(l1 * sizeof(int*)); for (i=0; i best_a) { best_a = a[i][l2 - 1]; *decal = - (l2 - (i + 1)); *strand = 1; } } for (j=0; j best_a) { best_a = a[l1 - 1][j]; *decal = l1 - (j + 1); *strand = 1; } } if (twostrand == 1) { // do the same, but reverse complement s2 s2c = getComplement(s2); for (i=0; i best_a) { best_a = a[i][l2 - 1]; *decal = - (l2 - (i + 1)); *strand = 0; } } for (j=0; j best_a) { best_a = a[l1 - 1][j]; *decal = l1 - (j + 1); *strand = 0; } } free(s2c); } return best_a; } char* getComplement(char* s) { int l = strlen(s); char* c; char d; int i; c = (char*)calloc(l+1, sizeof(char)); for (i=l-1; i>=0; i--) { if (s[i] == 'A') d = 'T'; else if (s[i] == 'T') d = 'A'; else if (s[i] == 'G') d = 'C'; else if (s[i] == 'C') d = 'G'; else if (s[i] == '.') d = '.'; else d = 'N'; strncat(c, &d, 1); } return c; } // // get the overlap between two arrays of 0,1 // ie i, unique s1, unique s2 // size = number of orthologs void get_overlap_array(char* site1, char* site2, int size, int* i, int* s1, int* s2) { int overlap_i = 0, overlap_s1 = 0, overlap_s2 = 0; int ii; for (ii=0; ii 32) return exp(gammln(n+1.0)); while (ntop 0) { return 1; } else { return 0; } } int sequenceOverlap(char* s1, char* s2) { pcre* re; char* ss; int l1 = strlen(s1); int i; int erroffset; int rc; const char* error; int startoffset = 0; int ovector[30]; int best_ov = 0; ss = (char*)malloc(l1 * sizeof(char)); //printf("look for overlaps between %s and %s\n", s1, s2); // // examine substrings of s1 // for (i=1; i<=l1; i++) { memcpy(ss, s1, i); ss[i] = '$' ; ss[i+1] = '\0'; re = pcre_compile(ss, 0, &error, &erroffset, NULL); if (re == NULL) { printf("Could not create RE\n"); } rc = pcre_exec(re, NULL, s2, strlen(s2), startoffset, 0, ovector, 30); if (rc > 0) { if (i > best_ov) best_ov = i; //printf("%s is a substring of %s\n", ss, s2); } else { //printf("%s is NOT a substring of %s\n", ss, s2); } memcpy(ss+1, s1+l1-i, i); ss[0] = '^' ; ss[i+1] = '\0'; re = pcre_compile(ss, 0, &error, &erroffset, NULL); if (re == NULL) { printf("Could not create RE\n"); } rc = pcre_exec(re, NULL, s2, strlen(s2), startoffset, 0, ovector, 30); if (rc > 0) { if (i > best_ov) best_ov = i; //printf("%s is a substring of %s\n", ss, s2); } else { //printf("%s is NOT a substring of %s\n", ss, s2); } } return best_ov; } // // substring // char *substr(char *string, int start, int length) { char *substring; substring = (char*)calloc(length+1, sizeof(char)); memcpy(substring, string+start, length*sizeof(char)); substring[length] = '\0'; return substring; }