diff --git a/similarity_search/include/method/hnsw.h b/similarity_search/include/method/hnsw.h index 747ef47..a5ea2b8 100644 --- a/similarity_search/include/method/hnsw.h +++ b/similarity_search/include/method/hnsw.h @@ -260,21 +260,21 @@ namespace similarity { void addFriendlevel(int level, HnswNode *element, const Space *space, int delaunay_type) { unique_lock lock(accessGuard_); - for (int i = 0; i < allFriends[level].size(); i++) - if (allFriends[level][i] == element) { + for (unsigned i = 0; i < allFriends_[level].size(); i++) + if (allFriends_[level][i] == element) { cerr << "This should not happen. For some reason the elements is " "already added"; return; } - allFriends[level].push_back(element); + allFriends_[level].push_back(element); bool shrink = false; if (level > 0) { - if (allFriends[level].size() > maxsize) { + if (allFriends_[level].size() > maxsize) { shrink = true; } else { shrink = false; } - } else if (allFriends[level].size() > maxsize0) { + } else if (allFriends_[level].size() > maxsize0) { shrink = true; } else { shrink = false; @@ -282,10 +282,10 @@ namespace similarity { if (shrink) { if (delaunay_type > 0) { priority_queue> resultSet; - // for (int i = 1; i < allFriends[level].size(); i++) { - for (int i = 0; i < allFriends[level].size(); i++) { - resultSet.emplace(space->IndexTimeDistance(this->getData(), allFriends[level][i]->getData()), - allFriends[level][i]); + // for (int i = 1; i < allFriends_[level].size(); i++) { + for (unsigned i = 0; i < allFriends_[level].size(); i++) { + resultSet.emplace(space->IndexTimeDistance(this->getData(), allFriends_[level][i]->getData()), + allFriends_[level][i]); } if (delaunay_type == 1) this->getNeighborsByHeuristic1(resultSet, resultSet.size() - 1, space); @@ -293,23 +293,23 @@ namespace similarity { this->getNeighborsByHeuristic2(resultSet, resultSet.size() - 1, space, level); else if (delaunay_type == 3) this->getNeighborsByHeuristic3(resultSet, resultSet.size() - 1, space, level); - 
allFriends[level].clear(); + allFriends_[level].clear(); while (resultSet.size()) { - allFriends[level].push_back(resultSet.top().getMSWNodeHier()); + allFriends_[level].push_back(resultSet.top().getMSWNodeHier()); resultSet.pop(); } } else { - dist_t max = space->IndexTimeDistance(this->getData(), allFriends[level][0]->getData()); + dist_t max = space->IndexTimeDistance(this->getData(), allFriends_[level][0]->getData()); int maxi = 0; - for (int i = 1; i < allFriends[level].size(); i++) { - dist_t curd = space->IndexTimeDistance(this->getData(), allFriends[level][i]->getData()); + for (unsigned i = 1; i < allFriends_[level].size(); i++) { + dist_t curd = space->IndexTimeDistance(this->getData(), allFriends_[level][i]->getData()); if (curd > max) { max = curd; maxi = i; } } - allFriends[level].erase(allFriends[level].begin() + maxi); + allFriends_[level].erase(allFriends_[level].begin() + maxi); } } } @@ -319,11 +319,11 @@ namespace similarity { level = level1; maxsize = maxFriends; maxsize0 = maxfriendslevel0; - allFriends.resize(level + 1); + allFriends_.resize(level + 1); for (int i = 0; i <= level; i++) { - allFriends[i].reserve(maxsize + 1); + allFriends_[i].reserve(maxsize + 1); } - allFriends[0].reserve(maxsize0 + 1); + allFriends_[0].reserve(maxsize0 + 1); } void copyDataAndLevel0LinksToOptIndex(char *mem1, size_t offsetlevels, size_t offsetData) @@ -336,10 +336,10 @@ namespace similarity { char *memlevels = mem1 + offsetlevels; char *memt = memlevels; - *((int *)(memt)) = (int)allFriends[0].size(); + *((int *)(memt)) = (int)allFriends_[0].size(); memt += sizeof(int); - for (size_t j = 0; j < allFriends[0].size(); j++) { - *((int *)(memt)) = (int)allFriends[0][j]->getId(); + for (size_t j = 0; j < allFriends_[0].size(); j++) { + *((int *)(memt)) = (int)allFriends_[0][j]->getId(); memt += sizeof(int); } mem = mem1 + offsetData; @@ -359,11 +359,11 @@ namespace similarity { for (int i = 1; i <= level; i++) { char *memt = memlevels; - *((int *)(memt)) = 
(int)allFriends[i].size(); + *((int *)(memt)) = (int)allFriends_[i].size(); memt += sizeof(int); - for (size_t j = 0; j < allFriends[i].size(); j++) { - *((int *)(memt)) = (int)allFriends[i][j]->getId(); + for (size_t j = 0; j < allFriends_[i].size(); j++) { + *((int *)(memt)) = (int)allFriends_[i][j]->getId(); memt += sizeof(int); } memlevels += (1 + maxsize) * sizeof(int); @@ -372,11 +372,11 @@ namespace similarity { } const Object *getData() const { return data_; } size_t getId() const { return id_; } - const vector &getAllFriends(int level) const { return allFriends[level]; } + const vector &getAllFriends(int level) const { return allFriends_[level]; } mutex accessGuard_; size_t id_; - vector> allFriends; + vector> allFriends_; int maxsize0; int maxsize; @@ -486,6 +486,15 @@ namespace similarity { return (int)r; } + void SaveOptimizedIndex(std::ostream& output); + void LoadOptimizedIndex(std::istream& input); + + void SaveRegularIndexBin(std::ostream& output); + void LoadRegularIndexBin(std::istream& input); + void SaveRegularIndexText(std::ostream& output); + void LoadRegularIndexText(std::istream& input); + + public: void kSearchElementsWithAttemptsLevel(const Space *space, const Object *queryObj, size_t NN, std::priority_queue> &resultSet, HnswNode *ep, diff --git a/similarity_search/src/method/hnsw.cc b/similarity_search/src/method/hnsw.cc index bb8009a..8f56637 100644 --- a/similarity_search/src/method/hnsw.cc +++ b/similarity_search/src/method/hnsw.cc @@ -39,6 +39,7 @@ #include "space.h" #include "space/space_lp.h" #include "thread_pool.h" +#include "utils.h" #include #include @@ -58,6 +59,17 @@ #define PORTABLE_ALIGN16 __declspec(align(16)) #endif +// For debug purposes we also implemented saving an index to a text file +#define USE_TEXT_REGULAR_INDEX (false) + +#define TOTAL_QTY "TOTAL_QTY" +#define MAX_LEVEL "MAX_LEVEL" +#define ENTER_POINT_ID "ENTER_POINT_ID" +#define FIELD_M "M" +#define FIELD_MAX_M "MAX_M" +#define FIELD_MAX_M0 "MAX_M0" +#define 
CURR_LEVEL "CURR_LEVEL" + namespace similarity { // This is the counter to keep the size of neighborhood information (for one node) @@ -88,14 +100,14 @@ namespace similarity { { int ok = 1; for (size_t i = 0; i < list.size(); i++) { - for (size_t j = 0; j < list[i]->allFriends[0].size(); j++) { - for (size_t k = j + 1; k < list[i]->allFriends[0].size(); k++) { - if (list[i]->allFriends[0][j] == list[i]->allFriends[0][k]) { + for (size_t j = 0; j < list[i]->allFriends_[0].size(); j++) { + for (size_t k = j + 1; k < list[i]->allFriends_[0].size(); k++) { + if (list[i]->allFriends_[0][j] == list[i]->allFriends_[0][k]) { cout << "\nDuplicate links\n\n\n\n\n!!!!!"; ok = 0; } } - if (list[i]->allFriends[0][j] == list[i]) { + if (list[i]->allFriends_[0][j] == list[i]) { cout << "\nLink to the same element\n\n\n\n\n!!!!!"; ok = 0; } @@ -114,8 +126,8 @@ namespace similarity { ofstream out(filename); size_t maxdegree = 0; for (HnswNode *node : list) { - if (node->allFriends[0].size() > maxdegree) - maxdegree = node->allFriends[0].size(); + if (node->allFriends_[0].size() > maxdegree) + maxdegree = node->allFriends_[0].size(); } vector distrin = vector(1000); @@ -123,7 +135,7 @@ namespace similarity { vector inconnections = vector(list.size()); vector outconnections = vector(list.size()); for (size_t i = 0; i < list.size(); i++) { - for (HnswNode *node : list[i]->allFriends[0]) { + for (HnswNode *node : list[i]->allFriends_[0]) { outconnections[list[i]->getId()]++; inconnections[node->getId()]++; } @@ -200,7 +212,10 @@ namespace similarity { ParallelFor(1, data_.size(), indexThreadQty_, [&](int id) { HnswNode *node = new HnswNode(data_[id], id); add(&space_, node); - ElList_[id] = node; + { + unique_lock lock(ElListGuard_); + ElList_[id] = node; + } if (progress_bar) ++(*progress_bar); }); @@ -223,7 +238,10 @@ namespace similarity { int id = data_.size() - pos_id; HnswNode *node = new HnswNode(data_[id], id); add(&space_, node); - ElList_[id] = node; + { + unique_lock 
lock(ElListGuard_); + ElList_[id] = node; + } if (progress_bar1) ++(*progress_bar1); }); @@ -278,8 +296,11 @@ namespace similarity { } } - ElList_[id]->allFriends[0].swap(rez); - // degrees[ElList_[id]->allFriends[0].size()]++; + { + unique_lock lock(ElList_[id]->accessGuard_); + ElList_[id]->allFriends_[0].swap(rez); + } + // degrees[ElList_[id]->allFriends_[0].size()]++; }); for (int i = 0; i < temp.size(); i++) delete temp[i]; @@ -291,6 +312,8 @@ namespace similarity { data_level0_memory_ = NULL; linkLists_ = NULL; + enterpointId_ = enterpoint_->getId(); + if (skip_optimized_index) { LOG(LIB_INFO) << "searchMethod = " << searchMethod_; pmgr.CheckUnused(); @@ -403,7 +426,7 @@ namespace similarity { linkLists_[i] = linkList; ElList_[i]->copyHigherLevelLinksToOptIndex(linkList, 0); }; - enterpointId_ = enterpoint_->getId(); + LOG(LIB_INFO) << "Finished making optimized index"; LOG(LIB_INFO) << "Maximum level = " << enterpoint_->level; LOG(LIB_INFO) << "Total memory allocated for optimized index+data: " << (total_memory_allocated >> 20) << " Mb"; @@ -646,13 +669,6 @@ namespace similarity { #endif } - template - void - Hnsw::addToElementListSynchronized(HnswNode *HierElement) - { - unique_lock lock(ElListGuard_); - ElList_.push_back(HierElement); - } template void Hnsw::Search(RangeQuery *query, IdType) const @@ -701,14 +717,33 @@ namespace similarity { template void - Hnsw::SaveIndex(const string &location) - { - if (!data_level0_memory_) - throw runtime_error("Storing non-optimized index is not supported yet!"); - - std::ofstream output(location, std::ios::binary); + Hnsw::SaveIndex(const string &location) { + std::ofstream output(location, + std::ios::binary /* text files can be opened in binary mode as well */); CHECK_MSG(output, "Cannot open file '" + location + "' for writing"); - streampos position; + output.exceptions(ios::badbit | ios::failbit); + + unsigned int optimIndexFlag = data_level0_memory_ != nullptr; + + + if (!optimIndexFlag) { +#if 
USE_TEXT_REGULAR_INDEX + SaveRegularIndexText(output); +#else + writeBinaryPOD(output, optimIndexFlag); + SaveRegularIndexBin(output); +#endif + } else { + writeBinaryPOD(output, optimIndexFlag); + SaveOptimizedIndex(output); + } + + output.close(); + } + + template + void + Hnsw::SaveOptimizedIndex(std::ostream& output) { totalElementsStored_ = ElList_.size(); writeBinaryPOD(output, totalElementsStored_); @@ -737,19 +772,223 @@ namespace similarity { if ((sizemass)) output.write(linkLists_[i], sizemass); }; - output.close(); + } template void - Hnsw::LoadIndex(const string &location) - { + Hnsw::SaveRegularIndexBin(std::ostream& output) { + totalElementsStored_ = ElList_.size(); + + writeBinaryPOD(output, totalElementsStored_); + writeBinaryPOD(output, maxlevel_); + writeBinaryPOD(output, enterpointId_); + writeBinaryPOD(output, M_); + writeBinaryPOD(output, maxM_); + writeBinaryPOD(output, maxM0_); + + for (unsigned i = 0; i < totalElementsStored_; ++i) { + const HnswNode& node = *ElList_[i]; + unsigned currlevel = node.level; + CHECK(currlevel + 1 == node.allFriends_.size()); + /* + * This check strangely fails ... 
+ CHECK_MSG(maxlevel_ >= currlevel, "" + "maxlevel_ (" + ConvertToString(maxlevel_) + ") < node.allFriends_.size() (" + ConvertToString(currlevel)); + */ + writeBinaryPOD(output, currlevel); + for (unsigned level = 0; level <= currlevel; ++level) { + const auto& friends = node.allFriends_[level]; + unsigned friendQty = friends.size(); + writeBinaryPOD(output, friendQty); + for (unsigned k = 0; k < friendQty; ++k) { + IdType friendId = friends[k]->id_; + writeBinaryPOD(output, friendId); + } + } + } + } + + template + void + Hnsw::SaveRegularIndexText(std::ostream& output) { + + size_t lineNum = 0; + + totalElementsStored_ = ElList_.size(); + + WriteField(output, TOTAL_QTY, totalElementsStored_); lineNum++; + WriteField(output, MAX_LEVEL, maxlevel_); lineNum++; + WriteField(output, ENTER_POINT_ID, enterpointId_); lineNum++; + WriteField(output, FIELD_M, M_); lineNum++; + WriteField(output, FIELD_MAX_M, maxM_); lineNum++; + WriteField(output, FIELD_MAX_M0, maxM0_); lineNum++; + + vector friendIds; + + for (unsigned i = 0; i < totalElementsStored_; ++i) { + const HnswNode& node = *ElList_[i]; + unsigned currlevel = node.level; + CHECK(currlevel + 1 == node.allFriends_.size()); + /* + * This check strangely fails ... 
+ CHECK_MSG(maxlevel_ >= currlevel, "" + "maxlevel_ (" + ConvertToString(maxlevel_) + ") < node.allFriends_.size() (" + ConvertToString(currlevel)); + */ + WriteField(output, CURR_LEVEL, currlevel); lineNum++; + for (unsigned level = 0; level <= currlevel; ++level) { + const auto& friends = node.allFriends_[level]; + unsigned friendQty = friends.size(); + + friendIds.resize(friendQty); + for (unsigned k = 0; k < friendQty; ++k) { + friendIds[k] = friends[k]->id_; + } + output << MergeIntoStr(friendIds, ' ') << endl; lineNum++; + } + } + WriteField(output, LINE_QTY, lineNum); + } + + + template + void + Hnsw::LoadRegularIndexText(std::istream& input) { + LOG(LIB_INFO) << "Loading regular index."; + size_t lineNum = 0; + ReadField(input, TOTAL_QTY, totalElementsStored_); lineNum++; + ReadField(input, MAX_LEVEL, maxlevel_); lineNum++; + ReadField(input, ENTER_POINT_ID, enterpointId_); lineNum++; + ReadField(input, FIELD_M, M_); lineNum++; + ReadField(input, FIELD_MAX_M, maxM_); lineNum++; + ReadField(input, FIELD_MAX_M0, maxM0_); lineNum++; + + fstdistfunc_ = nullptr; + dist_func_type_ = 0; + searchMethod_ = 0; + + ElList_.resize(totalElementsStored_); + for (unsigned id = 0; id < totalElementsStored_; ++id) { + ElList_[id] = new HnswNode(data_[id], id); + } + + enterpoint_ = ElList_[enterpointId_]; + + string line; + vector friendIds; + for (unsigned id = 0; id < totalElementsStored_; ++id) { + HnswNode& node = *ElList_[id]; + unsigned currlevel; + ReadField(input, CURR_LEVEL, currlevel); lineNum++; + node.level = currlevel; + node.allFriends_.resize(currlevel + 1); + for (unsigned level = 0; level <= currlevel; ++level) { + CHECK_MSG(getline(input, line), + "Failed to read line #" + ConvertToString(lineNum)); lineNum++; + CHECK_MSG(SplitStr(line, friendIds, ' '), + "Failed to extract neighbor IDs from line #" + ConvertToString(lineNum)); + + unsigned friendQty = friendIds.size(); + + auto& friends = node.allFriends_[level]; + friends.resize(friendQty); + for 
(unsigned k = 0; k < friendQty; ++k) { + IdType friendId = friendIds[k]; + CHECK_MSG(friendId >= 0 && friendId < totalElementsStored_, + "Invalid friendId = " + ConvertToString(friendId) + " for node id: " + ConvertToString(id)); + friends[k] = ElList_[friendId]; + } + } + } + size_t ExpLineNum; + ReadField(input, LINE_QTY, ExpLineNum); + CHECK_MSG(lineNum == ExpLineNum, + DATA_MUTATION_ERROR_MSG + " (expected number of lines " + ConvertToString(ExpLineNum) + + " read so far doesn't match the number of read lines: " + ConvertToString(lineNum)); + } + + + template + void + Hnsw::LoadRegularIndexBin(std::istream& input) { + LOG(LIB_INFO) << "Loading regular index."; + readBinaryPOD(input, totalElementsStored_); + readBinaryPOD(input, maxlevel_); + readBinaryPOD(input, enterpointId_); + readBinaryPOD(input, M_); + readBinaryPOD(input, maxM_); + readBinaryPOD(input, maxM0_); + + fstdistfunc_ = nullptr; + dist_func_type_ = 0; + searchMethod_ = 0; + + + ElList_.resize(totalElementsStored_); + for (unsigned id = 0; id < totalElementsStored_; ++id) { + ElList_[id] = new HnswNode(data_[id], id); + } + + enterpoint_ = ElList_[enterpointId_]; + + for (unsigned id = 0; id < totalElementsStored_; ++id) { + HnswNode& node = *ElList_[id]; + unsigned currlevel; + readBinaryPOD(input, currlevel); + node.level = currlevel; + node.allFriends_.resize(currlevel + 1); + for (unsigned level = 0; level <= currlevel; ++level) { + auto& friends = node.allFriends_[level]; + unsigned friendQty; + readBinaryPOD(input, friendQty); + + friends.resize(friendQty); + for (unsigned k = 0; k < friendQty; ++k) { + IdType friendId; + readBinaryPOD(input, friendId); + CHECK_MSG(friendId >= 0 && friendId < totalElementsStored_, + "Invalid friendId = " + ConvertToString(friendId) + " for node id: " + ConvertToString(id)); + friends[k] = ElList_[friendId]; + } + } + } + } + + template + void + Hnsw::LoadIndex(const string &location) { LOG(LIB_INFO) << "Loading index from " << location; - std::ifstream 
input(location, std::ios::binary); + std::ifstream input(location, std::ios::binary); // binary, matching SaveIndex (text mode would translate the bytes read by readBinaryPOD) CHECK_MSG(input, "Cannot open file '" + location + "' for reading"); - streampos position; - // input.seekg(0, std::ios::beg); + input.exceptions(ios::badbit | ios::failbit); + +#if USE_TEXT_REGULAR_INDEX + LoadRegularIndexText(input); +#else + unsigned int optimIndexFlag= 0; + + readBinaryPOD(input, optimIndexFlag); + + if (!optimIndexFlag) { + LoadRegularIndexBin(input); + } else { + LoadOptimizedIndex(input); + } +#endif + input.close(); + + LOG(LIB_INFO) << "Finished loading index"; + visitedlistpool = new VisitedListPool(1, totalElementsStored_); + + + } + + + template + void + Hnsw::LoadOptimizedIndex(std::istream& input) { + LOG(LIB_INFO) << "Loading optimized index."; readBinaryPOD(input, totalElementsStored_); readBinaryPOD(input, memoryPerObject_); @@ -783,7 +1022,7 @@ namespace similarity { for (size_t i = 0; i < totalElementsStored_; i++) { SIZEMASS_TYPE linkListSize; readBinaryPOD(input, linkListSize); - position = input.tellg(); + if (linkListSize == 0) { linkLists_[i] = nullptr; } else { @@ -792,10 +1031,7 @@ namespace similarity { } data_rearranged_[i] = new Object(data_level0_memory_ + (i)*memoryPerObject_ + offsetData_); } - LOG(LIB_INFO) << "Finished loading index"; - visitedlistpool = new VisitedListPool(1, totalElementsStored_); - input.close(); } template @@ -855,7 +1091,7 @@ namespace similarity { while (!candidateQueue.empty()) { auto iter = candidateQueue.top(); // This one was already compared to the query const HnswNodeDistFarther &currEv = iter; - // Check condtion to end the search + // Check condition to end the search dist_t lowerBound = closestDistQueue1.top().getDistance(); if (currEv.getDistance() > lowerBound) { break;