#include #include #include #include #include #include #include #include #define max(x,y) (x>y?x:y) #define min(x,y) (x 0) { k2 = motifsize - k1; } // // END READING THE LIST OF KMERS // // alloc memory for local storage local_sites1 = (char*)malloc(n * sizeof(char)); local_sites2 = (char*)malloc(n * sizeof(char)); // alloc more memory for calculating hypergeometric p-values scores = (float*)malloc(n * sizeof(float)); a_s1 = (int*)calloc(n, sizeof(int)); a_s2 = (int*)calloc(n, sizeof(int)); a_i = (int*)calloc(n, sizeof(int)); // // START BUILDING A PREFIX TREE KMER => INDEX // // associates a node to an index node2index = (int*)malloc(n * motifsize * sizeof(int)); // we are going to need many nodes ... pnode = (prefixNode*)malloc(motifsize * n * (sizeof(prefixNode))); pnode[0].next[0] = -1; pnode[0].next[1] = -1; pnode[0].next[2] = -1; pnode[0].next[3] = -1; // // add all the words in the list // nextfree = 1; for (i=0; i INDEX // // // open sequence file 1 // fp1 = fopen(fastafile1, "r"); if (!fp1) { printf("cannot open %s ..\n", fastafile1); exit(0); } // // open sequence file 2 // fp2 = fopen(fastafile2, "r"); if (!fp2) { printf("cannot open %s ..\n", fastafile2); exit(0); } mce = (char*)calloc(motifsize+2, sizeof(char)); mcb = (char*)calloc(motifsize+2, sizeof(char)); c_mce = (char*)calloc(motifsize+2, sizeof(char)); // // START SCANNING SPECIES 1 AND 2 // s = 0; nbgenes = 0; while ( (seq1 = nextSequence(fp1, &name1, &size1, &nextSequence_started1, &nextSequence_ended1, nextSequence_currentLine1)) ) { //printf("Read SEQ 1 %s, %s ..\n", name1, seq1); l = strlen(seq1); // initilialize the local store memset(local_sites1, 0, n); stop = strlen(seq1) - motifsize - gap + 1; if (stop < 0) stop = 0; //printf("stop1 = %d\n", stop); for (j=0; j 0) { if (k1 > 0) { strncpy(mce, seq1+j, k1); strncpy(mce + k1, seq1 + j + k1 + gap, k2); } else { strncpy(mce, seq1+j, (motifsize / 2)); strncpy(mce+(motifsize / 2), seq1+j + (motifsize / 2) + gap, motifsize - (motifsize / 2)); } } else { strncpy(mce, seq1+j, motifsize); } mce[motifsize] = '\0'; c_mce = complement(mce); nb1 = existWord(mce, motifsize, pnode); nb2 = -1; if (singlestrand == 0) nb2 = existWord(c_mce, motifsize, pnode); // if any of them has been found, no need to go on if ((nb1 > 0) || (nb2 > 0)) { if (nb1 > 0) i = node2index[nb1]; else i = node2index[nb2]; // increment s1 for the ith kmer if it has not already been incremented if (local_sites1[i] == 0) { a_s1[i] ++; // the ith kmer is present in this upstream region local_sites1[i] = 1; } } // free memory free(c_mce); } free(name1); free(seq1); // // SCAN SEQUENCE 2 // seq2 = nextSequence(fp2, &name2, &size2, &nextSequence_started2, &nextSequence_ended2, nextSequence_currentLine2); // printf("Read SEQ 2 %s, %s ..\n", name2, seq2); // initilialize the local store memset(local_sites2, 0, n); l = strlen(seq2); stop = strlen(seq2) - motifsize - gap + 1; if (stop < 0) stop = 0; //printf("stop2 = %d\n", stop); for (j=0; j 0) { if (k1 > 0) { strncpy(mce, seq1+j, k1); strncpy(mce + k1, seq1 + j + k1 + gap, k2); } else { strncpy(mce, seq2+j, (motifsize / 2)); strncpy(mce+(motifsize/2), seq2+j + (motifsize / 2) + gap, motifsize - (motifsize / 2)); } } else { strncpy(mce, seq2+j, motifsize); } mce[motifsize] = '\0'; c_mce = complement(mce); nb1 = existWord(mce, motifsize, pnode); nb2 = -1; if (singlestrand == 0) nb2 = existWord(c_mce, motifsize, pnode); // if any of them has been found, no need to go on if ((nb1 > 0) || (nb2 > 0)) { if (nb1 > 0) i = node2index[nb1]; else i = node2index[nb2]; // increment s2 for this kmer if (local_sites2[i] == 0) { a_s2[i] ++; // the ith kmer is present in this upstream region local_sites2[i] = 1; // the same kmer was present in the orth upstream sequence if (local_sites1[i] > 0) { a_i[i] ++; } } } free(c_mce); } free(name2); free(seq2); s++; nbgenes ++; } fclose(fp1); fclose(fp2); // // END SCANNING SPECIES 1 AND 2 // // // traverse every kmers and calculate conservation score // kmers_array = (Kmer*)malloc(n * sizeof(Kmer)); for (i=0; i 0) { if (k1 > 0) { strncpy(stmp, kmers[ii], k1); stmp[k1] = '\0'; printf("%s", stmp); for (j=0; j 0) && (i_limit == a)) { break; } } for (i=0; i= 0) { if (s[j] == '\n') { s[j] = '\0'; return; } j--; } } char* complement(char* s) { int l = strlen(s); char* c; char d; int i; c = (char*)calloc(l+1, sizeof(char)); for (i=l-1; i>=0; i--) { if (s[i] == 'A') d = 'T'; else if (s[i] == 'T') d = 'A'; else if (s[i] == 'G') d = 'C'; else if (s[i] == 'C') d = 'G'; else d = 'N'; strncat(c, &d, 1); } return c; } // // returns the next sequence in the file fp // char* nextSequence(FILE* fp, char** name, int* size, int* nextSequence_started, int* nextSequence_ended, char* nextSequence_currentLine) { int len = 20000; char* seq; // not yet started, get the first line if (*nextSequence_started == 0) { fgets(nextSequence_currentLine, len, fp); chomp(nextSequence_currentLine); *nextSequence_started = 1; } //printf("%s\n", nextSequence_currentLine); // if file is finished, return 0 if (*nextSequence_ended == 1) { return 0; } // if started, line should be filled with ">..." if (nextSequence_currentLine[0] == '>') { *name = strdup(nextSequence_currentLine + 1); // create a new line seq = (char*)calloc(100000, sizeof(char) ); while (1) { fgets(nextSequence_currentLine, len, fp); if (feof(fp)) { *nextSequence_ended = 1; return seq; } chomp(nextSequence_currentLine); if (strlen(nextSequence_currentLine) == 0) { continue; } if (nextSequence_currentLine[0] == '>') { return seq; } else { strcat(seq, nextSequence_currentLine); } } } else return 0; } double lcumhyper(int i, int s1, int s2, int N) { return log(cumhyper(i, s1, s2, N)); } double cumhyper(int i, int s1, int s2, int N) { int min = (s1 32) return exp(gammln(n+1.0)); while (ntopscore > b->score) return 1; else if(a->score == b->score) return 0; else return -1; } /*** PREFIX TREE OPTIONS ***/ int nucl(char c) { if (c == 'A') return 0; else if (c == 'C') return 1; else if (c == 'G') return 2; else if (c == 'T') return 3; else return -1; } // add word to the prefix tree (if it is not there) int addWord(char* s, int m, prefixNode* pnode, int* nextfree) { //printf("Adding %s to the prefix tree ..\n", s); int i, k; int j = 0; for (i=0; i