diff --git a/POMP-DETECT/contig_generator.cpp b/POMP-DETECT/contig_generator.cpp new file mode 100644 index 0000000..5eab9b2 --- /dev/null +++ b/POMP-DETECT/contig_generator.cpp @@ -0,0 +1,834 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace std; + +vector random_list; + +bool isExist(vector& list, int number) { + for (int i = 0; i < list.size(); i++) { + if (list[i] == number) { + return true; + } + } + return false; +} + + +void generateRandomNumbers(int argument, int how_many){ + + for (int i = 0; i < how_many; i++) { + while (true) { + int randomNumber = rand() % argument; + if (!isExist(random_list, randomNumber)) { + random_list.push_back(randomNumber); + break; + } + } + } +} + + +string NumberToString (int Number){ + ostringstream ss; + ss << Number; + return ss.str(); +} + +struct Information{ + int readId; + uint8_t position; +}; + +struct KMer{ + int readId; + uint8_t startPosition; +}; + +struct Map{ + uint8_t sourcePositionLow; + uint8_t firstTargetPosition; + bool intersection; + int readId; +}; + +struct Read{ + string read; + int readId; + tr1::unordered_map list; + string compressed_read; + int consensus_id; + uint8_t pos_in_consensus; + bool is_already_compressed; + bool is_perfect; +}; + +struct Cluster{ + string consensus; + vector list; +}; + +uint8_t kMerLength, final_hamming_distance, hamming_distance, overlap, coverage; +int cut_off_value, max_value, discard, no_neighbor, number_already_compressed = 0, range_to, range_from; +uint8_t finalKMerLength, readLength; +tr1::unordered_map readsList; +tr1::unordered_map > hashMap; +vector read_cluster; +list sort_strings; +int id = 0, consensus_length = 0; + +void buildKMerFromRead(string, int, bool); +void getClusteredReads(int); + +bool compare_nocase (const std::string& first, const std::string& second) +{ + unsigned int i=0; + while ( (itolower(second[i])) return false; + ++i; + } + return ( first.length() < second.length() ); +} + + +void initialize_parameters(){ + + + kMerLength = 14; + cut_off_value = 400; + discard = 2000; + overlap = finalKMerLength; + coverage = 60; + + + if(finalKMerLength < readLength/2){ + finalKMerLength = readLength/2; + overlap = finalKMerLength; + } + +} + +void readReads(const char* fileName, bool accept){ + ifstream infile; + infile.open(fileName); + int counter = 0, readId = -1, iteration = 0; + string id; + bool store = false; + while(!infile.eof()) + { + string text; + getline(infile, text); + if(counter % 2 == 1){ + if(!store){ + readLength = text.length(); + initialize_parameters(); + store = true; + } + sort_strings.push_back(text); + ++iteration; + if(iteration % 1000000 == 0){ + cout << "BUILDING COMPLETED OF: " << iteration << endl; + } + + }else if(!text.empty()){ + id = text.substr(1); + //readId = atoi(id.c_str()) - 1; + readId = readId + 1; + } + counter++; + } + infile.close(); +} + +inline int getIndex(char& a) { + switch (a) { + case 'A': + case 'a': + return 1; + + case 'C': + case 'c': + return 2; + + case 'G': + case 'g': + return 3; + + case 'T': + case 't': + return 4; + + case 'N': + case 'n': + return 5; + + default: + return 0; + } +} + +inline char getChar(uint8_t index) { + switch (index) { + case 1: + return 'A'; + + case 2: + return 'C'; + + case 3: + return 'G'; + + case 4: + return 'T'; + + case 5: + return 'N'; + + default: + return 'X'; + } +} + +inline bool calculateHD(string& readOne, string& readTwo, uint8_t& positionOne, uint8_t& positionTwo) { + int hammingDistance = 0; + + int length = readOne.length() - positionOne + 1 >= readTwo.length() - positionTwo + 1 ? readTwo.length() - positionTwo + 1 : readOne.length() - positionOne + 1; + + + for (unsigned int i = 0; i < length; i++) { + if (readOne[i + positionOne] != readTwo[i + positionTwo]) { + hammingDistance++; + if(hammingDistance > final_hamming_distance){ + return false; + } + } + } + return true; +} + +inline void buildKMerFromRead(string read, int readId, bool accept) { + uint8_t iteration = read.length() - kMerLength + 1; + uint8_t random_number = 0; + tr1::hash str_hash; + + for (uint8_t i = 0; i < iteration; i++) { + random_number = rand() % 100 + 1; + if(random_number <= 100){ + KMer newKMer; + newKMer.readId = readId; + newKMer.startPosition = i; + size_t value = str_hash(read.substr(i, kMerLength)); + vector* list = &hashMap[value]; + if(list -> size() < discard + 10){ + list->push_back(newKMer); + } + } + } + + Read obj; + obj.read = read; + obj.readId = readId; + obj.consensus_id = -1; + obj.compressed_read = ""; + obj.is_already_compressed = false; + + if(accept){ + obj.is_perfect = true; + }else{ + obj.is_perfect = false; + } + + readsList[readId] = obj; +} + +inline void setMaps(KMer* sourceKMer, Read* source, KMer* targetKMer, Read* target) { + int targetId = targetKMer -> readId; + int low, sourcePositionLow; + + Map* maps = &source -> list[targetId]; + + if (maps->intersection == true) { + low = maps -> firstTargetPosition; + if (low >= targetKMer -> startPosition) { + low = targetKMer -> startPosition; + sourcePositionLow = sourceKMer -> startPosition; + maps -> firstTargetPosition = low; + maps -> sourcePositionLow = sourcePositionLow; + } + + } else { + low = targetKMer -> startPosition; + sourcePositionLow = sourceKMer -> startPosition; + + Map maps; + maps.firstTargetPosition = low; + maps.sourcePositionLow = sourcePositionLow; + maps.intersection = true; + maps.readId = target -> readId; + (source -> list)[targetId] = maps; + } +} + +void getClusteredReads1(int cut_value) { + + cout << "STARTED 1ST STAGE..." << endl; + + KMer* kMerOne, *kMerTwo; + int readIdOne, readIdTwo, counter = 0, i, j; + uint8_t positionA, positionB; + Read* readObjOne, *readObjTwo; + uint8_t threshold = readLength - overlap + 1; + int traverse_length = 0; + + for (tr1::unordered_map >::iterator it = hashMap.begin(); it != hashMap.end();) { + counter++; + + if(counter % 1000000 == 0){ + cout << "LISTS TRAVERSED: " << counter << endl; + } + + traverse_length = it->second.size(); + + for (i = 0; i < traverse_length; i++) { + + kMerOne = &it->second[i]; + + positionA = kMerOne -> startPosition; + + if(positionA > threshold){ + continue; + } + + readIdOne = kMerOne -> readId; + readObjOne = &readsList[readIdOne]; + + + for (j = i + 1; j < traverse_length; j++) { + + kMerTwo = &it->second[j]; + + positionB = kMerTwo -> startPosition; + + if(positionB > threshold){ + continue; + } + + readIdTwo = kMerTwo -> readId; + + if(readIdOne == readIdTwo){continue;} + + readObjTwo = &readsList[readIdTwo]; + + + if(readObjOne -> is_perfect && readObjTwo -> is_perfect) continue; + + + if(calculateHD(readObjOne -> read, readObjTwo -> read, positionA, positionB)) { + if(!readObjOne -> is_perfect){ + setMaps(kMerOne, readObjOne, kMerTwo, readObjTwo); + } + if(!readObjTwo -> is_perfect){ + setMaps(kMerTwo, readObjTwo, kMerOne, readObjOne); + } + } + } + } + + it = hashMap.erase(it); + } +} + +void process_aligned_reads(vector& list, string& consensus, uint8_t& front_deletion) { + + int position, identical; + + for (int j = 0; j < list.size(); j++) { + string text; + Read& target_read = readsList[list[j].readId]; + string& read = target_read.read; + position = list[j].position - front_deletion; + identical = 0; + + for (int i = 0; i < read.length(); i++) { + if (consensus[position + i] == read[i]) { + identical++; + } else if (consensus[position + i] != read[i]) { + if (identical > 0) { + text += NumberToString(identical) + read[i]; + } else { + text.push_back(read[i]); + } + identical = 0; + } + } + + string& compressed_read = target_read.compressed_read; + + if (compressed_read.empty()) { + compressed_read = text; + target_read.consensus_id = id; + target_read.pos_in_consensus = position; + + //if(compressed_read.length() <= readLength/8){ + target_read.is_already_compressed = true; + //} + + } else if (compressed_read.length() > text.length()) { + compressed_read = text; + target_read.consensus_id = id; + target_read.pos_in_consensus = position; + + //if(compressed_read.length() <= readLength/8){ + target_read.is_already_compressed = true; + //} + } + + } +} + +int alignReads(Read* baseReadObj, int** count) { + + tr1::unordered_map& hashMap = baseReadObj -> list; + vector list; + Information info; + string& baseRead = baseReadObj -> read; + + if(hashMap.size() == 0){ + no_neighbor++; + return hashMap.size(); + } + + for(unsigned int i = 0; i < consensus_length; i++){ + for(int j = 0; j < 6; j++){ + count[i][j] = 0; + } + } + + int pos_index = consensus_length / 4; + + if(!baseReadObj->is_already_compressed){ + + info.position = pos_index; + info.readId = baseReadObj -> readId; + list.push_back(info); + + for (unsigned int i = 0; i < baseRead.length(); i++) { + count[i + pos_index][getIndex(baseRead[i])]++; + count[i + pos_index][0]++; + } + }else{ + return hashMap.size(); + } + + + int counter = 0, iteration = 0; + uint8_t sourcePosition = 0; + uint8_t targetPosition = 0; + uint8_t start = 0; + + + for (tr1::unordered_map::iterator it = hashMap.begin(); it != hashMap.end(); ++it) { + + Map& maps = it -> second; + + /*if((readsList[it -> first]).is_already_compressed){ + number_already_compressed++; + continue; + }*/ + + Read* neighbor_object = &readsList[maps.readId]; + string& target = neighbor_object -> read; + + bool is_perfect = neighbor_object -> is_perfect; + + Information info; + + sourcePosition = maps.sourcePositionLow; + targetPosition = maps.firstTargetPosition; + int temp_store = 0; + + info.readId = it -> first; + + if (sourcePosition - targetPosition >= 0) { + counter = 0; + start = sourcePosition - targetPosition + pos_index; + info.position = start; + + for (uint8_t index = 0; index < baseRead.length(); index++) { + if(!is_perfect){ + count[index + start][getIndex(target[counter])]++; + }else{ + temp_store = count[index + start][getIndex(target[counter])]; + count[index + start][getIndex(target[counter])] = temp_store + 3; + temp_store = count[index + start][0]; + count[index + start][0] = temp_store + 3; + } + counter++; + count[index + start][0]++; + } + + } else { + start = pos_index - targetPosition; + info.position = start; + counter = 0; + + for (uint8_t index = 0; index < baseRead.length(); index++) { + if(!is_perfect){ + count[index + start][getIndex(target[counter])]++; + }else{ + temp_store = count[index + start][getIndex(target[counter])]; + count[index + start][getIndex(target[counter])] = temp_store + 3; + temp_store = count[index + start][0]; + count[index + start][0] = temp_store + 3; + } + counter++; + count[index + start][0]++; + } + } + + + if(neighbor_object -> is_already_compressed == false){// && neighbor_object -> is_perfect == false){ + list.push_back(info); + } + } + + string consensus; + bool accept = true; + uint8_t slide = 0; + int maxOccurrence = 0; + int maxPosition = 0; + + + for (int i = 0; i < consensus_length; i++) { + + maxOccurrence = 0; + maxPosition = 0; + + for (int j = 1; j <= 5; j++) { + if (maxOccurrence <= count[i][j]) { + maxOccurrence = count[i][j]; + maxPosition = j; + } + } + if (maxOccurrence > 0) { + accept = false; + consensus.push_back(getChar(maxPosition)); + } else { + if (accept) { + slide++; + } + } + } + + Cluster cluster; + cluster.consensus = consensus; + read_cluster.push_back(cluster); + + process_aligned_reads(list, consensus, slide); + + id++; + int size = hashMap.size(); + hashMap.clear(); + return size; +} + +void write(const char* outFile, string text){ + ofstream myfile(outFile, ios::out|ios::app); + if (myfile.is_open()){ + myfile << text; + myfile.close(); + } else cout << "Unable to open file"; +} + + +int main(int argc,char *argv[]) +{ + srand(time(NULL)); + + /*const char* readFileNameIR = "Data/ec-random-out.fa"; + const char* readFileNamePR = "Data/ec-random-out.fa"; + const char* out_file = "Data/random-incorrect-reads.fa";*/ + + if (argc != 6) { + printf("UNABLE TO RUN...\n"); + return 1; + } + + const char* readFileNameIR = argv[1]; + const char* out_file = argv[2]; + consensus_length = atoi(argv[3]); + + finalKMerLength = atoi(argv[4]); + final_hamming_distance = atoi(argv[5]); + + + no_neighbor = 0; + id = 0; + + remove(out_file); + + const clock_t begin_time = clock(); + + cout << "STARTING..." << endl; + int** count; + + time_t now = time(0); + char* dt = ctime(&now); + + cout << "BUILDING STARTED AT: " << dt << endl; + + readReads(readFileNameIR, false); + + cout << "NUMBER_OF_READS: " << sort_strings.size() << endl; + + + list::iterator it; + int readId = 0, hamming_dist = 0; + + + sort_strings.unique(); + cout << "NUMBER_OF_UNIQUE_READS: " << sort_strings.size() << endl; + + + for (it = sort_strings.begin(); it != sort_strings.end(); ++it){ + buildKMerFromRead(*it, readId, false); + readId = readId + 1; + + if(readId % 1000000 == 0){ + cout << "BUILDING COMPLETED OF: " << readId << endl; + } + } + + sort_strings.clear(); + + + + count = new int *[consensus_length]; + for (unsigned int index = 0; index < consensus_length; index++){ + count[index] = new int[6]; + } + + cout << endl << "READ LENGTH: " << (int) readLength << endl; + cout << "K1-MER: " << (int) kMerLength << endl; + cout << "K2-MER: " << (int) finalKMerLength << endl; + cout << "CUTOFF VALUE: " << cut_off_value << endl; + cout << "MINIMUM OVERLAP: " << (int) overlap << endl; + cout << "DISCARD THRESHOLD: " << discard << endl; + cout << "MAX CONSENSUS LENGTH: " << consensus_length << endl; + + now = time(0); + dt = ctime(&now); + cout << "BUILDING COMPLETED AT: " << dt << endl; + cout << "READS: " << readsList.size() << " HASH_MAP_1_SIZE: " << hashMap.size() << endl; + cout << "CLUSTERING..." << endl; + + int max = 0, min = 99999, omit = 0; + + for (tr1::unordered_map >::iterator it = hashMap.begin(); it != hashMap.end();) { + if(it->second.size() > discard){ + it = hashMap.erase(it); + omit++; + }else if(it->second.size() <= 1){ + it = hashMap.erase(it); + omit++; + }else{ + random_shuffle(it->second.begin(), it->second.end()); + if(it->second.size() > cut_off_value){ + it->second.erase(it->second.begin() + cut_off_value, it->second.end()); + } + max_value += it->second.size(); + it++; + } + } + + cout << "MOD HASHMAP SIZE: " << hashMap.size() << endl; + cout << "DISCARD: " << omit << endl; + + int safe_cut = (int)((double)max_value/hashMap.size()); + cout << "SAFE CUT: " << safe_cut << endl; + + getClusteredReads1(safe_cut); + hashMap.clear(); + + int size = 0; + + + now = time(0); + dt = ctime(&now); + cout << "CLUSTERING COMPLETED AT: " << dt << endl; + cout << "ALIGNING..." << endl; + + int shared = 0; + int reads_count = 1; + + for (tr1::unordered_map::iterator it = readsList.begin(); it != readsList.end(); ++it) { + shared += alignReads(&it->second, count); + reads_count++; + } + + now = time(0); + dt = ctime(&now); + + cout << "SHARED: " << (double) shared/readsList.size() << endl; + cout << "ALONE: " << no_neighbor << endl; + cout << "OMITTED: " << number_already_compressed << endl; + cout << "ALIGNING COMPLETED AT: " << dt << endl; + + const clock_t end_time = clock(); + + cout << "TIME IN SECONDS: " << float( end_time - begin_time ) / CLOCKS_PER_SEC << endl; + + string out; + string out2; + string out3; + + int diff = 0, iterate = 0, next = 0, next_diff = 0, number_discards = 0; + for (tr1::unordered_map::iterator it = readsList.begin(); it != readsList.end(); ++it) { + Read& object = it->second; + + if(object.consensus_id < 0 || object.compressed_read.length() > readLength/2){ + if(!object.is_perfect){ + number_discards++; + //out2 += NumberToString(i); + out2 += object.read; + out2 += "\n"; + object.read = ""; + object.compressed_read = ""; + } + }else{ + vector* list = &read_cluster[object.consensus_id].list; + list->push_back(object.readId); + } + } + + + for (int i = 0; i < read_cluster.size(); i++) { + int consensus_id = i; + Cluster& cluster = read_cluster[i]; + vector& read_list = cluster.list; + vector obj_list; + string& consensus_string = cluster.consensus; + int8_t min = 120; + int8_t max = -120; + + if(read_list.size() == 0) {continue;} + + for(int i = 0; i < read_list.size(); i++){ + obj_list.push_back(readsList[read_list[i]]); + if(obj_list[i].pos_in_consensus < min) {min = obj_list[i].pos_in_consensus;} + if(obj_list[i].pos_in_consensus > max) {max = obj_list[i].pos_in_consensus;} + } + + int prev_length = consensus_string.length(); + int count_pos = (max - min) + readLength + 1; + + try{ + consensus_string = consensus_string.substr(min, count_pos); + }catch(...){ + for(int i = 0; i < obj_list.size(); i++){ + Read& object = obj_list[i]; + if(!object.is_perfect){ + number_discards++; + out += object.read + "\n"; + //out += NumberToString(read_list[i]) + "\n"; + } + } + continue; + } + + + int current_length = consensus_string.length(); + + iterate = 0; + for(int i = 0; i < obj_list.size(); i++){ + Read& object = obj_list[i]; + + if(read_list.size() == 1){ + if(!object.is_perfect){ + number_discards++; + out += object.read + "\n"; + object.read = ""; + object.compressed_read = ""; + break; + } + } + + if(iterate == 0){ + out += consensus_string + "\n"; + diff = read_list[i]; + if(object.compressed_read.length() > 0){ + /*out += NumberToString(read_list[i]); + out += " "; + out += NumberToString(object.pos_in_consensus - min); + out += " "; + out += object.compressed_read; + out += "\n";*/ + }else{ + /*out += NumberToString(read_list[i]); + out += " "; + out += NumberToString(object.pos_in_consensus - min); + out += "\n";*/ + } + iterate++; + }else{ + if(object.compressed_read.length() > 0){ + int t = object.pos_in_consensus - min; + if(t + readLength - 1 > consensus_string.length()){ + out3 += object.read + "\n"; + //out3 += NumberToString(read_list[i]) + "\n"; + continue; + } + diff = read_list[i]; + /*out += NumberToString(diff); + out += " "; + out += NumberToString(object.pos_in_consensus - min); + out += " "; + out += object.compressed_read; + out += "\n";*/ + }else{ + diff = read_list[i]; + /*out += NumberToString(diff); + out += " "; + out += NumberToString(object.pos_in_consensus - min); + out += "\n";*/ + } + diff = read_list[i]; + } + object.read = ""; + object.compressed_read = ""; + } + } + + cout << "DISCARDS: " << number_discards << endl; + + string preamble = NumberToString(readLength) + "\n" + "!" + "\n"; + //write(out_file, preamble); + write(out_file, out); + write(out_file, out3); + //write(out_file, "% \n"); + write(out_file, out2); + + now = time(0); + dt = ctime(&now); + cout << "COMPLETED AT: " << dt << endl; + const clock_t _end_time = clock(); + cout << "TIME IN SECONDS: " << float( _end_time - begin_time ) / CLOCKS_PER_SEC << endl; + + + return 0; +} +