From 090e6f7a4d2c58fa6f86b1cd0d3cf0163b0c8dfc Mon Sep 17 00:00:00 2001 From: searchivairus Date: Thu, 20 Jul 2017 15:33:01 -0400 Subject: [PATCH] Fixing a memory leak + shutting up warnings --- .../include/factory/init_methods.h | 4 - .../include/factory/method/small_world_rand.h | 9 - .../include/method/small_world_rand.h | 2 +- .../include/method/small_world_rand_split.h | 225 ------ .../src/method/small_world_rand.cc | 36 +- .../src/method/small_world_rand_split.cc | 639 ------------------ 6 files changed, 21 insertions(+), 894 deletions(-) delete mode 100644 similarity_search/include/method/small_world_rand_split.h delete mode 100644 similarity_search/src/method/small_world_rand_split.cc diff --git a/similarity_search/include/factory/init_methods.h b/similarity_search/include/factory/init_methods.h index c413863..83305bb 100644 --- a/similarity_search/include/factory/init_methods.h +++ b/similarity_search/include/factory/init_methods.h @@ -157,10 +157,6 @@ inline void initMethods() { REGISTER_METHOD_CREATOR(double, METH_HNSW, CreateHnsw) REGISTER_METHOD_CREATOR(int, METH_HNSW, CreateHnsw) - REGISTER_METHOD_CREATOR(float, METH_SMALL_WORLD_RAND_SPLIT, CreateSmallWorldRandSplit) - REGISTER_METHOD_CREATOR(double, METH_SMALL_WORLD_RAND_SPLIT, CreateSmallWorldRandSplit) - REGISTER_METHOD_CREATOR(int, METH_SMALL_WORLD_RAND_SPLIT, CreateSmallWorldRandSplit) - // SA-tree REGISTER_METHOD_CREATOR(float, METH_SATREE, CreateSATree) REGISTER_METHOD_CREATOR(double, METH_SATREE, CreateSATree) diff --git a/similarity_search/include/factory/method/small_world_rand.h b/similarity_search/include/factory/method/small_world_rand.h index 2d6ed97..5cf1da4 100644 --- a/similarity_search/include/factory/method/small_world_rand.h +++ b/similarity_search/include/factory/method/small_world_rand.h @@ -18,7 +18,6 @@ #define _FACTORY_SMALL_WORLD_RAND_H_ #include -#include namespace similarity { @@ -34,14 +33,6 @@ Index* CreateSmallWorldRand(bool PrintProgress, return new SmallWorldRand(PrintProgress, space, DataObjects); } -template -Index* CreateSmallWorldRandSplit(bool PrintProgress, - const string& SpaceType, - Space& space, - const ObjectVector& DataObjects) { - return new SmallWorldRandSplit(PrintProgress, space, DataObjects); -} - /* * End of creating functions. */ diff --git a/similarity_search/include/method/small_world_rand.h b/similarity_search/include/method/small_world_rand.h index ec34f50..3a97c4e 100644 --- a/similarity_search/include/method/small_world_rand.h +++ b/similarity_search/include/method/small_world_rand.h @@ -292,7 +292,7 @@ class SmallWorldRand : public Index { mutable mutex ElListGuard_; ElementMap ElList_; - size_t NextNodeId_ = 0; // This is internal node id + IdType NextNodeId_ = 0; // This is internal node id bool changedAfterCreateIndex_ = false; MSWNode* pEntryPoint_ = nullptr; diff --git a/similarity_search/include/method/small_world_rand_split.h b/similarity_search/include/method/small_world_rand_split.h deleted file mode 100644 index a279172..0000000 --- a/similarity_search/include/method/small_world_rand_split.h +++ /dev/null @@ -1,225 +0,0 @@ -/** - * Non-metric Space Library - * - * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). - * With contributions from Lawrence Cayton (http://lcayton.com/) and others. - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2014 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. - * - */ - -#ifndef _SMALL_WORLD_RAND_SPLIT_H_ -#define _SMALL_WORLD_RAND_SPLIT_H_ - -#include "index.h" -#include "params.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#define METH_SMALL_WORLD_RAND_SPLIT "sw-graph-split" - -namespace similarity { - -using std::string; -using std::vector; -using std::thread; -using std::mutex; -using std::unique_lock; -using std::condition_variable; -using std::ref; - -template -class Space; - -/* - * - * A small world approach. It builds the knn-graph incrementally and relies on - * a straightforward randomized algorithm to insert an element. - * The index can be split into chunks (to make indexing faster). Usually, - * this comes at the expense of search efficiency. - * - * The main publication is as follows, but the basic algorithm was also presented as SISAP'12: - * Malkov, Y., Ponomarenko, A., Logvinov, A., & Krylov, V., 2014. - * Approximate nearest neighbor algorithm based on navigable small world graphs. Information Systems, 45, 61-68. - * - */ - -//---------------------------------- - -//---------------------------------- -template -class SmallWorldRandSplit : public Index { -public: - class MSWNode{ - public: - MSWNode(const Object *Obj, size_t id) { - data_ = Obj; - id_ = id; - } - ~MSWNode(){}; - void removeAllFriends(){ - friends.clear(); - } - /* - * 1. The list of friend pointers is sorted. - * 2. If bCheckForDup == true addFriend checks for - * duplicates using binary searching (via pointer comparison). - */ - void addFriend(MSWNode* element, bool bCheckForDup) { - unique_lock lock(accessGuard_); - - if (bCheckForDup) { - auto it = lower_bound(friends.begin(), friends.end(), element); - if (it == friends.end() || (*it) != element) { - friends.insert(it, element); - } - } else { - friends.push_back(element); - } - } - const Object* getData() const { - return data_; - } - size_t getId() const { return id_; } - void setId(size_t id) { id_ = id; } - /* - * THIS NOTE APPLIES ONLY TO THE INDEXING PHASE: - * - * Before getting access to the friends, - * one needs to lock the mutex accessGuard_ - * The mutex can be released ONLY when - * we exit the scope that has access to - * the reference returned by getAllFriends() - */ - const vector& getAllFriends() const { - return friends; - } - - mutex accessGuard_; - - private: - const Object* data_; - size_t id_; - vector friends; - }; - //---------------------------------- - class EvaluatedMSWNodeReverse{ - public: - EvaluatedMSWNodeReverse() { - distance = 0; - element = NULL; - } - EvaluatedMSWNodeReverse(dist_t di, MSWNode* node) { - distance = di; - element = node; - } - ~EvaluatedMSWNodeReverse(){} - dist_t getDistance() const {return distance;} - MSWNode* getMSWNode() const {return element;} - bool operator< (const EvaluatedMSWNodeReverse &obj1) const { - return (distance > obj1.getDistance()); - } - - private: - dist_t distance; - MSWNode* element; - }; - - class EvaluatedMSWNodeDirect{ - public: - EvaluatedMSWNodeDirect() { - distance = 0; - element = NULL; - } - EvaluatedMSWNodeDirect(dist_t di, MSWNode* node) { - distance = di; - element = node; - } - ~EvaluatedMSWNodeDirect(){} - dist_t getDistance() const {return distance;} - MSWNode* getMSWNode() const {return element;} - bool operator< (const EvaluatedMSWNodeDirect &obj1) const { - return (distance < obj1.getDistance()); - } - - private: - dist_t distance; - MSWNode* element; - }; - - virtual void SaveIndex(const string &location) override; - - virtual void LoadIndex(const string &location) override; - - SmallWorldRandSplit(bool PrintProgress, - const Space& space, - const ObjectVector& data); - void CreateIndex(const AnyParams& IndexParams) override; - - ~SmallWorldRandSplit(); - - typedef std::vector ElementList; - - const std::string StrDesc() const override; - void Search(RangeQuery* query, IdType) const override; - void Search(KNNQuery* query, IdType) const override; - MSWNode* getRandomEntryPoint(size_t start, size_t end) const; - MSWNode* getRandomEntryPointLocked(size_t start, size_t end) const; - size_t getEntryQtyLocked() const; - - void searchForIndexing(const Object *queryObj, - const size_t chunkStart, const size_t chunkEnd, size_t randomEntryPointEnd, - vector& visitedBitset, - std::priority_queue &resultSet) const; - void add(MSWNode *newElement, const size_t chunkStart, const size_t chunkEnd, vector& visitedBitset); - void link(MSWNode* first, MSWNode* second){ - // addFriend checks for duplicates if the second argument is true - first->addFriend(second, true); - second->addFriend(first, true); - } - - void SetQueryTimeParams(const AnyParams& ) override; - -private: - - size_t NN_; - size_t efConstruction_; - size_t efSearch_; - size_t initIndexAttempts_; - size_t initSearchAttempts_; - size_t indexThreadQty_; - size_t chunkIndexSize_; - - const Space& space_; - const ObjectVector& data_; - bool PrintProgress_; - - mutable mutex ElListGuard_; - ElementList ElList_; - -protected: - - DISABLE_COPY_AND_ASSIGN(SmallWorldRandSplit); -}; - - - - -} - -#endif diff --git a/similarity_search/src/method/small_world_rand.cc b/similarity_search/src/method/small_world_rand.cc index 54bca18..1e73474 100644 --- a/similarity_search/src/method/small_world_rand.cc +++ b/similarity_search/src/method/small_world_rand.cc @@ -217,7 +217,7 @@ void SmallWorldRand::DeleteBatch(const vector& batchData, int de CHECK_MSG(it != ElList_.end(), "An attempt to delete a non-existing object with id=" + ConvertToString(objId)); MSWNode* delNode=it->second; IdType delNodeId = delNode->getId(); - CHECK(delNodeId < delNodesBitset.size()); + CHECK(delNodeId < (ssize_t)delNodesBitset.size()); delNodesBitset[delNodeId]=true; vToDelNodes.push_back(delNode); ElList_.erase(it); @@ -225,7 +225,7 @@ void SmallWorldRand::DeleteBatch(const vector& batchData, int de for (MSWNode* node: vToDelNodes) { for (MSWNode* pNeighbor : node->getAllFriends()) { IdType neighbNodeId = pNeighbor->getId(); - CHECK(neighbNodeId < delNodesBitset.size()); + CHECK(neighbNodeId < (ssize_t)delNodesBitset.size()); if (!delNodesBitset.at(neighbNodeId)) vToPatchNodes.push_back(pNeighbor); } @@ -247,7 +247,7 @@ void SmallWorldRand::DeleteBatch(const vector& batchData, int de mutex mtx; vector threads; - for (int i = 0; i < indexThreadQty_; ++i) { + for (size_t i = 0; i < indexThreadQty_; ++i) { threads.push_back(thread( [&]() { MSWNode* node = nullptr; @@ -267,11 +267,11 @@ void SmallWorldRand::DeleteBatch(const vector& batchData, int de for (auto it : ElList_) { MSWNode* node = it.second; IdType nodeId = node->getId(); - CHECK(nodeId < delNodesBitset.size()); + CHECK(nodeId < (ssize_t)delNodesBitset.size()); CHECK(!delNodesBitset.at(nodeId)); for (MSWNode* neighb : node->getAllFriends()) { IdType neighNodeId = neighb->getId(); - CHECK(neighNodeId < delNodesBitset.size()); + CHECK(neighNodeId < (ssize_t)delNodesBitset.size()); if (delNodesBitset.at(neighNodeId)) { /* * Two things to check here: @@ -312,7 +312,7 @@ template void SmallWorldRand::CheckIDs() const { // ElList_.size() can be smaller though - CHECK_MSG(NextNodeId_ >= ElList_.size(), + CHECK_MSG(NextNodeId_ >= (ssize_t)ElList_.size(), "Bug NextNodeId_ = " + ConvertToString(NextNodeId_) + " is < ElList_.size() = " + ConvertToString(ElList_.size())); vector visitedBitset(NextNodeId_); @@ -386,6 +386,10 @@ const std::string SmallWorldRand::StrDesc() const { template SmallWorldRand::~SmallWorldRand() { + for (auto e : ElList_) { + MSWNode* pNode = e.second; + delete pNode; + } } template @@ -427,7 +431,7 @@ SmallWorldRand::searchForIndexing(const Object *queryObj, closestDistQueue.pop(); } - size_t nodeId = provider->getId(); + IdType nodeId = provider->getId(); CHECK_MSG(nodeId < nextNodeIdUpperBound, "Bug: nodeId (" + ConvertToString(nodeId) + ") > nextNodeIdUpperBound (" + ConvertToString(nextNodeIdUpperBound)); @@ -473,7 +477,7 @@ SmallWorldRand::searchForIndexing(const Object *queryObj, for (size_t neighborId = 0; neighborId < neighborQty; ++neighborId) { MSWNode* pNeighbor = neighborCopy[neighborId]; - size_t nodeId = pNeighbor->getId(); + IdType nodeId = pNeighbor->getId(); CHECK_MSG(nodeId < nextNodeIdUpperBound, "Bug: nodeId (" + ConvertToString(nodeId) + ") > nextNodeIdUpperBound (" + ConvertToString(nextNodeIdUpperBound)); if (!visitedBitset[nodeId]) { @@ -584,12 +588,12 @@ void SmallWorldRand::SearchV1Merge(KNNQuery* query) const { dist_t d = query->DistanceObjLeft(currObj); sortedArr.push_unsorted_grow(d, currNode); // It won't grow - size_t nodeId = currNode->getId(); + IdType nodeId = currNode->getId(); CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_) +")"); visitedBitset[nodeId] = true; - int_fast32_t currElem = 0; + uint_fast32_t currElem = 0; typedef typename SortArrBI::Item QueueItem; @@ -660,7 +664,7 @@ void SmallWorldRand::SearchV1Merge(KNNQuery* query) const { ++currElem; } - for (int_fast32_t i = 0; i < query->GetK() && i < sortedArr.size(); ++i) { + for (uint_fast32_t i = 0; i < query->GetK() && i < sortedArr.size(); ++i) { query->CheckAndAddToResult(queueData[i].key, queueData[i].data->getData()); } } @@ -696,7 +700,7 @@ void SmallWorldRand::SearchOld(KNNQuery* query) const { candidateQueue.push(ev); closestDistQueue.emplace(d); - size_t nodeId = provider->getId(); + IdType nodeId = provider->getId(); CHECK_MSG(nodeId < NextNodeId_, "Bug: nodeId (" + ConvertToString(nodeId) + ") > NextNodeId_ (" +ConvertToString(NextNodeId_)); visitedBitset[nodeId] = true; @@ -762,14 +766,14 @@ void SmallWorldRand::SaveIndex(const string &location) { for(ElementMap::iterator it = ElList_.begin(); it != ElList_.end(); ++it) { MSWNode* pNode = it->second; IdType nodeID = pNode->getId(); - CHECK_MSG(nodeID >= 0 && nodeID < data_.size(), + CHECK_MSG(nodeID >= 0 && nodeID < (ssize_t)data_.size(), "Bug: unexpected node ID " + ConvertToString(nodeID) + " for object ID " + ConvertToString(pNode->getData()->id()) + "data_.size() = " + ConvertToString(data_.size())); outFile << nodeID << ":" << pNode->getData()->id() << ":"; for (const MSWNode* pNodeFriend: pNode->getAllFriends()) { IdType nodeFriendID = pNodeFriend->getId(); - CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < data_.size(), + CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < (ssize_t)data_.size(), "Bug: unexpected node ID " + ConvertToString(nodeFriendID) + " for object ID " + ConvertToString(pNodeFriend->getData()->id()) + "data_.size() = " + ConvertToString(data_.size())); @@ -816,7 +820,7 @@ void SmallWorldRand::LoadIndex(const string &location) { string("Bug or inconsitent data, wrong format, c1=") + c1 + ",c2=" + c2 + " line: " + ConvertToString(lineNum) ); - CHECK_MSG(nodeID >= 0 && nodeID < data_.size(), + CHECK_MSG(nodeID >= 0 && nodeID < (ssize_t)data_.size(), DATA_MUTATION_ERROR_MSG + " (unexpected node ID " + ConvertToString(nodeID) + " for object ID " + ConvertToString(objID) + " data_.size() = " + ConvertToString(data_.size()) + ")"); @@ -836,7 +840,7 @@ void SmallWorldRand::LoadIndex(const string &location) { "Bug, got NULL pointer in the second pass for nodeID " + ConvertToString(nodeID)); IdType nodeFriendID; while (str >> nodeFriendID) { - CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < data_.size(), + CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < (ssize_t)data_.size(), "Bug: unexpected node ID " + ConvertToString(nodeFriendID) + "data_.size() = " + ConvertToString(data_.size())); MSWNode *pFriendNode = ptrMapper[nodeFriendID]; diff --git a/similarity_search/src/method/small_world_rand_split.cc b/similarity_search/src/method/small_world_rand_split.cc deleted file mode 100644 index 77fc19c..0000000 --- a/similarity_search/src/method/small_world_rand_split.cc +++ /dev/null @@ -1,639 +0,0 @@ -/** - * Non-metric Space Library - * - * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov (http://boytsov.info). - * With contributions from Lawrence Cayton (http://lcayton.com/) and others. - * - * For the complete list of contributors and further details see: - * https://github.com/searchivarius/NonMetricSpaceLib - * - * Copyright (c) 2014 - * - * This code is released under the - * Apache License Version 2.0 http://www.apache.org/licenses/. - * - */ -#include -#include -#include - -#include "space.h" -#include "knnquery.h" -#include "rangequery.h" -#include "ported_boost_progress.h" -#include "method/small_world_rand_split.h" - -#include -#include -#include -#include -#include - -//#define START_WITH_E0 -#define START_WITH_E0_AT_QUERY_TIME - -//#define USE_ALTERNATIVE_FOR_INDEXING - -namespace similarity { - -using namespace std; - -template -struct IndexThreadParamsSplitSW { - const Space& space_; - SmallWorldRandSplit& index_; - const ObjectVector& data_; - size_t index_every_; - size_t out_of_; - size_t start_; - size_t end_; - ProgressDisplay* progress_bar_; - mutex& display_mutex_; - size_t progress_update_qty_; - vector visitedBitset_; - - IndexThreadParamsSplitSW( - const Space& space, - SmallWorldRandSplit& index, - const ObjectVector& data, - size_t index_every, - size_t out_of, - size_t start, - size_t end, - ProgressDisplay* progress_bar, - mutex& display_mutex, - size_t progress_update_qty - ) : - space_(space), - index_(index), - data_(data), - index_every_(index_every), - out_of_(out_of), - start_(start), - end_(end), - progress_bar_(progress_bar), - display_mutex_(display_mutex), - progress_update_qty_(progress_update_qty), - visitedBitset_(end - start) - { } -}; - -template -struct IndexThreadSplitSW { - void operator()(IndexThreadParamsSplitSW& prm) { - ProgressDisplay* progress_bar = prm.progress_bar_; - mutex& display_mutex(prm.display_mutex_); - - size_t nextQty = prm.progress_update_qty_; - for (size_t id = prm.start_; id < prm.end_; ++id) { - if (prm.index_every_ == id % prm.out_of_) { - typename SmallWorldRandSplit::MSWNode* node = new typename SmallWorldRandSplit::MSWNode(prm.data_[id], id); - prm.index_.add(node, prm.start_, prm.end_, prm.visitedBitset_); - - if ((id + 1 >= min(prm.data_.size(), nextQty)) && progress_bar) { - unique_lock lock(display_mutex); - (*progress_bar) += (nextQty - progress_bar->count()); - nextQty += prm.progress_update_qty_; - } - } - } - } -}; - -template -SmallWorldRandSplit::SmallWorldRandSplit(bool PrintProgress, - const Space& space, - const ObjectVector& data) : - space_(space), data_(data), PrintProgress_(PrintProgress) {} - -template -void SmallWorldRandSplit::CreateIndex(const AnyParams& IndexParams) -{ - AnyParamManager pmgr(IndexParams); - - pmgr.GetParamOptional("NN", NN_, 10); - pmgr.GetParamOptional("efConstruction", efConstruction_, NN_); - pmgr.GetParamOptional("chunkIndexSize", chunkIndexSize_, data_.size()); - CHECK_MSG(chunkIndexSize_ > 0, "chunkIndexSize should be > 0"); - - chunkIndexSize_ = min(chunkIndexSize_, data_.size()); - efSearch_ = NN_; - pmgr.GetParamOptional("initIndexAttempts", initIndexAttempts_, 2); - pmgr.GetParamOptional("indexThreadQty", indexThreadQty_, thread::hardware_concurrency()); - if (indexThreadQty_ <=0) indexThreadQty_ = 1; - - LOG(LIB_INFO) << "NN = " << NN_; - LOG(LIB_INFO) << "efConstruction = " << efConstruction_; - LOG(LIB_INFO) << "chunkIndexSize = " << chunkIndexSize_; - LOG(LIB_INFO) << "initIndexAttempts = " << initIndexAttempts_; - LOG(LIB_INFO) << "indexThreadQty = " << indexThreadQty_; - - pmgr.CheckUnused(); - - SetQueryTimeParams(getEmptyParams()); - - if (data_.empty()) return; - - unique_ptr progress_bar(PrintProgress_ ? - new ProgressDisplay(data_.size(), cerr) - :NULL); - - for (size_t start = 0, chunkNum = 0; start < data_.size(); start += chunkIndexSize_, ++chunkNum) { - size_t end = min(data_.size(), start + chunkIndexSize_); - CHECK(end > start); - - vector threads(indexThreadQty_); - vector>> threadParams; - mutex progressBarMutex; - - for (size_t i = 0; i < indexThreadQty_; ++i) { - threadParams.push_back(shared_ptr>( - new IndexThreadParamsSplitSW(space_, *this, data_, - i, indexThreadQty_, - start, end, - progress_bar.get(), progressBarMutex, 200))); - } - for (size_t i = 0; i < indexThreadQty_; ++i) { - threads[i] = thread(IndexThreadSplitSW(), ref(*threadParams[i])); - } - for (size_t i = 0; i < indexThreadQty_; ++i) { - threads[i].join(); - } - } - - if (progress_bar) { - (*progress_bar) += (progress_bar->expected_count() - progress_bar->count()); - } - - if (ElList_.size() != data_.size()) { - stringstream err; - err << "Bug: Indexing seems to be incomplete ElList_.size() (" << ElList_.size() << ") isn't equal to data_.size() (" << data_.size() << ")"; - LOG(LIB_INFO) << err.str(); - throw runtime_error(err.str()); - } -} - -template -void -SmallWorldRandSplit::SetQueryTimeParams(const AnyParams& QueryTimeParams) { - AnyParamManager pmgr(QueryTimeParams); - pmgr.GetParamOptional("initSearchAttempts", initSearchAttempts_, 3); - pmgr.GetParamOptional("efSearch", efSearch_, NN_); - pmgr.CheckUnused(); - LOG(LIB_INFO) << "Set SmallWorldRandSplit query-time parameters:"; - LOG(LIB_INFO) << "initSearchAttempts =" << initSearchAttempts_; - LOG(LIB_INFO) << "efSearch =" << efSearch_; -} - -template -const std::string SmallWorldRandSplit::StrDesc() const { - return METH_SMALL_WORLD_RAND_SPLIT; -} - -template -SmallWorldRandSplit::~SmallWorldRandSplit() { -} - -template -typename SmallWorldRandSplit::MSWNode* SmallWorldRandSplit::getRandomEntryPointLocked(size_t start, size_t end) const -{ - unique_lock lock(ElListGuard_); - MSWNode* res = getRandomEntryPoint(start, end); - return res; -} - -template -size_t SmallWorldRandSplit::getEntryQtyLocked() const -{ - unique_lock lock(ElListGuard_); - size_t res = ElList_.size(); - return res; -} - -template -typename SmallWorldRandSplit::MSWNode* SmallWorldRandSplit::getRandomEntryPoint(size_t start, size_t end) const { - if(end <= start) { - return NULL; - } else { - size_t num = RandomInt()% (end - start); - return ElList_[start + num]; - } -} - -template -void -SmallWorldRandSplit::searchForIndexing(const Object *queryObj, - size_t chunkStart, size_t chunkEnd, size_t randomEntryPointEnd, - vector& visitedBitset, - priority_queue &resultSet) const -{ -/* - * The trick of using large dense bitsets instead of unordered_set was - * borrowed from Wei Dong's kgraph: https://github.com/aaalgo/kgraph - * - * This trick works really well even in a multi-threaded mode. Indeed, the amount - * of allocated memory is small. For example, if one has 8M entries, the size of - * the bitmap is merely 1 MB. Furthermore, setting 1MB of entries to zero via memset would take only - * a fraction of millisecond. - */ - visitedBitset.assign(visitedBitset.size(), false); // clear the bitset - - vector neighborCopy; - - for (size_t i=0; i < initIndexAttempts_; i++){ - /** - * Search for the k most closest elements to the query. - */ - - MSWNode* provider = NULL; - - // Some entries will hold NULLs temporarily - for (int att = 0; ((provider = getRandomEntryPointLocked(chunkStart, randomEntryPointEnd)) == NULL) && att < 100; ++att); - - if (provider == NULL) { - unique_lock lock(ElListGuard_); - provider = ElList_[chunkStart]; - } - - priority_queue closestDistQueue; - priority_queue candidateSet; - -#ifdef USE_ALTERNATIVE_FOR_INDEXING - dist_t d = space_.ProxyDistance(provider->getData(), queryObj); - #pragma message "Using an alternative/proxy function for indexing, not the original one!" -#else - dist_t d = space_.IndexTimeDistance(provider->getData(), queryObj); -#endif - EvaluatedMSWNodeReverse ev(d, provider); - - candidateSet.push(ev); - closestDistQueue.push(d); - - if (closestDistQueue.size() > efConstruction_) { - closestDistQueue.pop(); - } - - size_t nodeId = provider->getId(); - CHECK_MSG(nodeId >= chunkStart && nodeId < chunkEnd, - "Bug, expecting node ID in the semi-open interval [" + ConvertToString(chunkStart) + "," + ConvertToString(chunkEnd) + ")"); - - visitedBitset[nodeId - chunkStart] = true; - - resultSet.emplace(d, provider); - - if (resultSet.size() > NN_) { // TODO check somewhere that NN > 0 - resultSet.pop(); - } - - while (!candidateSet.empty()) { - const EvaluatedMSWNodeReverse& currEv = candidateSet.top(); - dist_t lowerBound = closestDistQueue.top(); - - /* - * Check if we reached a local minimum. - */ - if (currEv.getDistance() > lowerBound) { - break; - } - MSWNode* currNode = currEv.getMSWNode(); - - /* - * This lock protects currNode from being modified - * while we are accessing elements of currNode. - */ - size_t neighborQty = 0; - { - unique_lock lock(currNode->accessGuard_); - - //const vector& neighbor = currNode->getAllFriends(); - const vector& neighbor = currNode->getAllFriends(); - neighborQty = neighbor.size(); - if (neighborQty > neighborCopy.size()) neighborCopy.resize(neighborQty); - for (size_t k = 0; k < neighborQty; ++k) - neighborCopy[k]=neighbor[k]; - } - - // Can't access curEv anymore! The reference would become invalid - candidateSet.pop(); - - // calculate distance to each neighbor - for (size_t neighborId = 0; neighborId < neighborQty; ++neighborId) { - MSWNode* pNeighbor = neighborCopy[neighborId]; - - size_t nodeId = pNeighbor->getId(); - CHECK_MSG(nodeId >= chunkStart && nodeId < chunkEnd, - "Bug, expecting node ID in the semi-open interval [" + ConvertToString(chunkStart) + "," + ConvertToString(chunkEnd) + ")"); - - if (!visitedBitset[nodeId - chunkStart]) { - visitedBitset[nodeId - chunkStart] = true; - -#ifdef USE_ALTERNATIVE_FOR_INDEXING - d = space_.ProxyDistance(pNeighbor->getData(), queryObj); - #pragma message "Using an alternative/proxy function for indexing, not the original one!" -#else - d = space_.IndexTimeDistance(pNeighbor->getData(), queryObj); -#endif - - if (closestDistQueue.size() < efConstruction_ || d < closestDistQueue.top()) { - closestDistQueue.push(d); - if (closestDistQueue.size() > efConstruction_) { - closestDistQueue.pop(); - } - candidateSet.emplace(d, pNeighbor); - } - - if (resultSet.size() < NN_ || resultSet.top().getDistance() > d) { - resultSet.emplace(d, pNeighbor); - if (resultSet.size() > NN_) { // TODO check somewhere that NN > 0 - resultSet.pop(); - } - } - } - } - } - } -} - - -template -void SmallWorldRandSplit::add(MSWNode *newElement, - const size_t chunkStart, const size_t chunkEnd, - vector& visitedBitset){ - newElement->removeAllFriends(); - - size_t randomEntryPointEnd = 0; - - size_t insertIndex = 0; - { - unique_lock lock(ElListGuard_); - CHECK_MSG(chunkIndexSize_ > 0, "chunkIndexSize should be > 0"); - CHECK(ElList_.size() >= chunkStart && ElList_.size() < chunkEnd); - size_t chunkIndexId = ElList_.size() % chunkIndexSize_ ; - if (0 == chunkIndexId) { - // If we start a new chunk, don't connect chunk elements to previously inserted entries! - ElList_.push_back(newElement); - return; - } - - CHECK(chunkIndexSize_ <= data_.size()); -#ifdef START_WITH_E0 - randomEntryPointEnd = chunkStart + 1; -#else - // don't think that we can ever have a data set so large that this summation would cause an overflow, - // also we ensure that chunkIndexSize_ <= data_.size() - randomEntryPointEnd = min(ElList_.size(), chunkStart + chunkIndexSize_); -#endif - insertIndex = ElList_.size(); - /* - * We need to claim the element space, otherwise we will get overlapping partitions in the multi-threaded mode. - * NULL shouldn't cause problems during indexing, because NULLed entries do not appear as neighbors, - * they can only be retrieved via a getRandomEntryPointLocked(). However, the function searchForIndexing - * calls getRandomEntryPointLocked() until a non-NULL entry is returned. - * After several failed attempts, we will use the first entry in the chunk as the starting point, - * as this entry is guranteed to be non-NULL. - */ - ElList_.push_back(NULL); - } - - CHECK(randomEntryPointEnd > chunkStart); - - { - priority_queue resultSet; - - searchForIndexing(newElement->getData(), chunkStart, chunkEnd, randomEntryPointEnd, visitedBitset, resultSet); - - // TODO actually we might need to add elements in the reverse order in the future. - // For the current implementation, however, the order doesn't seem to matter - while (!resultSet.empty()) { - link(resultSet.top().getMSWNode(), newElement); - resultSet.pop(); - } - } - - { - unique_lock lock(ElListGuard_); - CHECK(NULL == ElList_[insertIndex]); - ElList_[insertIndex] = newElement; - } - -} - -template -void SmallWorldRandSplit::Search(RangeQuery* query, IdType) const { - throw runtime_error("Range search is not supported!"); -} - - -template -void SmallWorldRandSplit::Search(KNNQuery* query, IdType) const { - if (ElList_.empty()) return; -/* - * The trick of using large dense bitsets instead of unordered_set was - * borrowed from Wei Dong's kgraph: https://github.com/aaalgo/kgraph - * - * This trick works really well even in a multi-threaded mode. Indeed, the amount - * of allocated memory is small. For example, if one has 8M entries, the size of - * the bitmap is merely 1 MB. Furthermore, setting 1MB of entries to zero via memset would take only - * a fraction of millisecond. - */ - vector visitedBitset(chunkIndexSize_); - - // don't think that we can every have a data set so this would cause an overflow, also - // we ensure that chunkIndexSize_ <= data_.size() - CHECK(chunkIndexSize_ <= data_.size()); - for (size_t start = 0; start < ElList_.size(); start += chunkIndexSize_) { - size_t end = min(ElList_.size(), start + chunkIndexSize_); - CHECK(end > start); - if (start) visitedBitset.assign(visitedBitset.size(), false); // Clear the visited array when moving to another chunk - for (size_t attempId =0; attempId < initSearchAttempts_; attempId++) { - /** - * Search of most k-closest elements to the query. - */ - - priority_queue closestDistQueue; //The set of all elements which distance was calculated - priority_queue candidateQueue; //the set of elements which we can use to evaluate - -#ifdef START_WITH_E0_AT_QUERY_TIME - size_t randomEntryPointEnd = start + 1; -#else - size_t randomEntryPointEnd = end; -#endif - MSWNode *provider = getRandomEntryPoint(start, randomEntryPointEnd); - - const Object* currObj = provider->getData(); - dist_t d = query->DistanceObjLeft(currObj); - query->CheckAndAddToResult(d, currObj); // This should be done before the object goes to the queue: otherwise it will not be compared to the query at all! - - EvaluatedMSWNodeReverse ev(d, provider); - candidateQueue.push(ev); - closestDistQueue.emplace(d); - - size_t nodeId = provider->getId(); - CHECK_MSG(nodeId >= start && nodeId < end, - "Bug, expecting node ID in the semi-open interval [" + ConvertToString(start) + "," + ConvertToString(end) + ")"); - visitedBitset[nodeId-start] = true; - - while(!candidateQueue.empty()) { - auto iter = candidateQueue.top(); // This one was already compared to the query - const EvaluatedMSWNodeReverse& currEv = iter; - - dist_t lowerBound = closestDistQueue.top(); - - // Did we reach a local minimum? - if (currEv.getDistance() > lowerBound) { - break; - } - - const vector& neighbor = (currEv.getMSWNode())->getAllFriends(); - - // Can't access curEv anymore! The reference would become invalid - candidateQueue.pop(); - - //calculate distance to each neighbor - for (auto iter = neighbor.begin(); iter != neighbor.end(); ++iter){ - size_t nodeId = (*iter)->getId(); - CHECK_MSG(nodeId >= start && nodeId < end, - "Bug, expecting node ID in the semi-open interval [" + ConvertToString(start) + "," + ConvertToString(end) + ")"); - - size_t nodeIdDiff = nodeId - start; - if (!visitedBitset[nodeIdDiff]) { - const Object* currObj = (*iter)->getData(); - dist_t d = query->DistanceObjLeft(currObj); - - visitedBitset[nodeIdDiff] = true; - - if (closestDistQueue.size() < efSearch_ || d < closestDistQueue.top()) { - closestDistQueue.emplace(d); - if (closestDistQueue.size() > efSearch_) { - closestDistQueue.pop(); - } - candidateQueue.emplace(d, *iter); - } - - query->CheckAndAddToResult(d, currObj); - } - } - } - } - } -} - -template -void SmallWorldRandSplit::SaveIndex(const string &location) { - ofstream outFile(location); - CHECK_MSG(outFile, "Cannot open file '" + location + "' for writing"); - outFile.exceptions(std::ios::badbit); - size_t lineNum = 0; - - WriteField(outFile, METHOD_DESC, StrDesc()); lineNum++; - WriteField(outFile, "NN", NN_); lineNum++; - WriteField(outFile, "chunkIndexSize", chunkIndexSize_); lineNum++; - - for (const MSWNode* pNode: ElList_) { - IdType nodeID = pNode->getId(); - CHECK_MSG(nodeID >= 0 && nodeID < data_.size(), - "Bug: unexpected node ID " + ConvertToString(nodeID) + - " for object ID " + ConvertToString(pNode->getData()->id()) + - "data_.size() = " + ConvertToString(data_.size())); - outFile << nodeID << ":" << pNode->getData()->id() << ":"; - for (const MSWNode* pNodeFriend: pNode->getAllFriends()) { - IdType nodeFriendID = pNodeFriend->getId(); - CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < data_.size(), - "Bug: unexpected node ID " + ConvertToString(nodeFriendID) + - " for object ID " + ConvertToString(pNodeFriend->getData()->id()) + - "data_.size() = " + ConvertToString(data_.size())); - outFile << ' ' << nodeFriendID; - } - outFile << endl; lineNum++; - } - outFile << endl; lineNum++; // The empty line indicates the end of data entries - WriteField(outFile, LINE_QTY, lineNum + 1 /* including this line */); - outFile.close(); -} - -template -void SmallWorldRandSplit::LoadIndex(const string &location) { - vector ptrMapper(data_.size()); - - for (unsigned pass = 0; pass < 2; ++ pass) { - ifstream inFile(location); - CHECK_MSG(inFile, "Cannot open file '" + location + "' for reading"); - inFile.exceptions(std::ios::badbit); - - size_t lineNum = 1; - string methDesc; - ReadField(inFile, METHOD_DESC, methDesc); - lineNum++; - CHECK_MSG(methDesc == StrDesc(), - "Looks like you try to use an index created by a different method: " + methDesc); - ReadField(inFile, "NN", NN_); - lineNum++; - - ReadField(inFile, "chunkIndexSize", chunkIndexSize_); - CHECK_MSG(chunkIndexSize_ <= data_.size(), "chunkIndexSize is larger than the # of data points, did you create this index for a larger data set?"); - lineNum++; - - string line; - while (getline(inFile, line)) { - if (line.empty()) { - lineNum++; break; - } - stringstream str(line); - str.exceptions(std::ios::badbit); - char c1, c2; - IdType nodeID, objID; - CHECK_MSG((str >> nodeID) && (str >> c1) && (str >> objID) && (str >> c2), - "Bug or inconsitent data, wrong format, line: " + ConvertToString(lineNum) - ); - CHECK_MSG(c1 == ':' && c2 == ':', - string("Bug or inconsitent data, wrong format, c1=") + c1 + ",c2=" + c2 + - " line: " + ConvertToString(lineNum) - ); - CHECK_MSG(nodeID >= 0 && nodeID < data_.size(), - DATA_MUTATION_ERROR_MSG + " (unexpected node ID " + ConvertToString(nodeID) + - " for object ID " + ConvertToString(objID) + - " data_.size() = " + ConvertToString(data_.size()) + ")"); - CHECK_MSG(data_[nodeID]->id() == objID, - DATA_MUTATION_ERROR_MSG + " (unexpected object ID " + ConvertToString(data_[nodeID]->id()) + - " for data element with ID " + ConvertToString(nodeID) + - " expected object ID: " + ConvertToString(objID) + ")" - ); - if (pass == 0) { - unique_ptr node(new MSWNode(data_[nodeID], nodeID)); - ptrMapper[nodeID] = node.get(); - ElList_.push_back(node.release()); - } else { - MSWNode *pNode = ptrMapper[nodeID]; - CHECK_MSG(pNode != NULL, - "Bug, got NULL pointer in the second pass for nodeID " + ConvertToString(nodeID)); - IdType nodeFriendID; - while (str >> nodeFriendID) { - CHECK_MSG(nodeFriendID >= 0 && nodeFriendID < data_.size(), - "Bug: unexpected node ID " + ConvertToString(nodeFriendID) + - "data_.size() = " + ConvertToString(data_.size())); - MSWNode *pFriendNode = ptrMapper[nodeFriendID]; - CHECK_MSG(pFriendNode != NULL, - "Bug, got NULL pointer in the second pass for nodeID " + ConvertToString(nodeFriendID)); - pNode->addFriend(pFriendNode, false /* don't check for duplicates */); - } - CHECK_MSG(str.eof(), - "It looks like there is some extract erroneous stuff in the end of the line " + - ConvertToString(lineNum)); - } - ++lineNum; - } - - size_t ExpLineNum; - ReadField(inFile, LINE_QTY, ExpLineNum); - CHECK_MSG(lineNum == ExpLineNum, - DATA_MUTATION_ERROR_MSG + " (expected number of lines " + ConvertToString(ExpLineNum) + - " read so far doesn't match the number of read lines: " + ConvertToString(lineNum) + ")"); - inFile.close(); - } -} - -template class SmallWorldRandSplit; -template class SmallWorldRandSplit; -template class SmallWorldRandSplit; - -}