From c409c1d4e1fc53ae1158319aaa230cc42a8938ea Mon Sep 17 00:00:00 2001 From: searchivairus Date: Sun, 7 Jan 2018 14:43:53 -0500 Subject: [PATCH] nearly full reproducibility with a given seed #57 --- python_bindings/nmslib.cc | 2 +- scripts/test_run.sh | 6 +---- similarity_search/apps/bench_distfunc.cc | 2 +- similarity_search/apps/bench_projection.cc | 2 +- similarity_search/apps/knn_stat.cc | 2 +- similarity_search/apps/main.cc | 2 +- similarity_search/apps/report_intr_dim.cc | 2 +- similarity_search/apps/test_clust.cc | 2 +- similarity_search/apps/tune_vptree.cc | 2 +- similarity_search/include/init.h | 2 +- similarity_search/include/method/hnsw.h | 5 ++-- similarity_search/include/utils.h | 29 +++++++++++++++------- similarity_search/src/init.cc | 6 ++++- similarity_search/src/method/hnsw.cc | 2 -- similarity_search/src/utils.cc | 1 - similarity_search/test/bunit.cc | 2 +- similarity_search/test/test_integr.cc | 2 +- 17 files changed, 39 insertions(+), 32 deletions(-) diff --git a/python_bindings/nmslib.cc b/python_bindings/nmslib.cc index 8b4c281..fed3a34 100644 --- a/python_bindings/nmslib.cc +++ b/python_bindings/nmslib.cc @@ -358,7 +358,7 @@ PYBIND11_PLUGIN(nmslib) { py::module nmslibLogger = logging.attr("getLogger")("nmslib"); setGlobalLogger(new PythonLogger(nmslibLogger)); - initLibrary(LIB_LOGCUSTOM, NULL); + initLibrary(0 /* seed */, LIB_LOGCUSTOM, NULL); py::module m(module_name, "Bindings for Non-Metric Space Library (NMSLIB)"); diff --git a/scripts/test_run.sh b/scripts/test_run.sh index 70fe45b..ab989cd 100755 --- a/scripts/test_run.sh +++ b/scripts/test_run.sh @@ -114,11 +114,7 @@ function do_run { # Methods that may create an index (at least for some spaces) do_run 0 "napp" " -c numPivot=512,numPivotIndex=64 " 0 "-t numPivotSearch=40 -t numPivotSearch=42 -t numPivotSearch=44 -t numPivotSearch=46 -t numPivotSearch=48" "napp_${SPACE}.index" do_run 1 "sw-graph" " -c NN=10 " 0 " -t efSearch=10 -t efSearch=20 -t efSearch=40 -t efSearch=80 -t efSearch=160 -t efSearch=240" "sw-graph_${SPACE}.index" -if [ "$SPACE" = "l2" -o "$SPACE" = "cosinesimil" ] ; then - do_run 1 "hnsw" " -c M=10 " 1 " -t efSearch=10 -t efSearch=20 -t efSearch=40 -t efSearch=80 -t efSearch=160 -t efSearch=240" "hnsw_${SPACE}.index" -else - do_run 1 "hnsw" " -c M=10 " 0 " -t efSearch=10 -t efSearch=20 -t efSearch=40 -t efSearch=80 -t efSearch=160 -t efSearch=240" -fi +do_run 1 "hnsw" " -c M=10 " 1 " -t efSearch=10 -t efSearch=20 -t efSearch=40 -t efSearch=80 -t efSearch=160 -t efSearch=240" "hnsw_${SPACE}.index" # Methods that do not support creation of an index do_run 1 "vptree" " -c tuneK=$K,bucketSize=50,desiredRecall=0.99,chunkBucket=1 0 " "" diff --git a/similarity_search/apps/bench_distfunc.cc b/similarity_search/apps/bench_distfunc.cc index 7871a1c..18de9b6 100644 --- a/similarity_search/apps/bench_distfunc.cc +++ b/similarity_search/apps/bench_distfunc.cc @@ -1971,7 +1971,7 @@ using namespace similarity; int main(int argc, char* argv[]) { string LogFile; if (argc == 2) LogFile = argv[1]; - initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); int nTest = 0; diff --git a/similarity_search/apps/bench_projection.cc b/similarity_search/apps/bench_projection.cc index d943342..2194ad0 100644 --- a/similarity_search/apps/bench_projection.cc +++ b/similarity_search/apps/bench_projection.cc @@ -250,7 +250,7 @@ int main(int argc, char *argv[]) { LOG(LIB_FATAL) << "Failed to parse cmd arguments"; } - initLibrary(logFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, logFile.c_str()); + initLibrary(0, logFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, logFile.c_str()); LOG(LIB_INFO) << "Program arguments are processed"; diff --git a/similarity_search/apps/knn_stat.cc b/similarity_search/apps/knn_stat.cc index 5b4e384..ea416db 100644 --- a/similarity_search/apps/knn_stat.cc +++ b/similarity_search/apps/knn_stat.cc @@ -377,7 +377,7 @@ int main(int argc, char *argv[]) { LOG(LIB_FATAL) << "Failed to parse cmd arguments"; } - initLibrary(logFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, logFile.c_str()); + initLibrary(0, logFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, logFile.c_str()); LOG(LIB_INFO) << "Program arguments are processed"; diff --git a/similarity_search/apps/main.cc b/similarity_search/apps/main.cc index 135f9c6..b1c4a85 100644 --- a/similarity_search/apps/main.cc +++ b/similarity_search/apps/main.cc @@ -566,7 +566,7 @@ int main(int ac, char* av[]) { "then you have to specify the gold-standard cache file!"); } - initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); LOG(LIB_INFO) << "Program arguments are processed"; diff --git a/similarity_search/apps/report_intr_dim.cc b/similarity_search/apps/report_intr_dim.cc index fa01dff..ec6c9ff 100644 --- a/similarity_search/apps/report_intr_dim.cc +++ b/similarity_search/apps/report_intr_dim.cc @@ -137,7 +137,7 @@ int main(int argc, char* argv[]) { THROW_RUNTIME_ERR(err); } - initLibrary(LIB_LOGSTDERR); + initLibrary(0, LIB_LOGSTDERR); if (DIST_TYPE_INT == distType) { TestSpace( diff --git a/similarity_search/apps/test_clust.cc b/similarity_search/apps/test_clust.cc index 468aeda..f6108d4 100644 --- a/similarity_search/apps/test_clust.cc +++ b/similarity_search/apps/test_clust.cc @@ -290,7 +290,7 @@ int main(int argc, char* argv[]) { SampleDistQty ); - initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); ToLower(DistType); ToLower(ClustType); diff --git a/similarity_search/apps/tune_vptree.cc b/similarity_search/apps/tune_vptree.cc index bd323d6..4dd4755 100644 --- a/similarity_search/apps/tune_vptree.cc +++ b/similarity_search/apps/tune_vptree.cc @@ -474,7 +474,7 @@ int main(int ac, char* av[]) { IndexParams, QueryTimeParams); - initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); ToLower(DistType); diff --git a/similarity_search/include/init.h b/similarity_search/include/init.h index a1563d4..5bb8caa 100644 --- a/similarity_search/include/init.h +++ b/similarity_search/include/init.h @@ -26,7 +26,7 @@ namespace similarity { - void initLibrary(LogChoice choice = LIB_LOGNONE, const char*pLogFile = NULL); + void initLibrary(int seed = 0, LogChoice choice = LIB_LOGNONE, const char*pLogFile = NULL); } #endif diff --git a/similarity_search/include/method/hnsw.h b/similarity_search/include/method/hnsw.h index c1b4305..7ee586e 100644 --- a/similarity_search/include/method/hnsw.h +++ b/similarity_search/include/method/hnsw.h @@ -481,8 +481,8 @@ namespace similarity { int getRandomLevel(double revSize) { - std::uniform_real_distribution distribution(0.0, 1.0); - double r = -log(distribution(*generator)) * revSize; + // RandomReal is thread-safe + float r = -log(RandomReal()) * revSize; return (int)r; } @@ -513,7 +513,6 @@ namespace similarity { // private: - std::unique_ptr generator; size_t M_; size_t maxM_; size_t maxM0_; diff --git a/similarity_search/include/utils.h b/similarity_search/include/utils.h index cc7a26d..c1aa21c 100644 --- a/similarity_search/include/utils.h +++ b/similarity_search/include/utils.h @@ -84,21 +84,32 @@ bool DoesFileExist(const char *filename); inline bool DoesFileExist(const string &filename) { return DoesFileExist(filename.c_str()); } -inline int RandomInt() { - // Static is thread-safe in C++ 11 - static random_device rdev; - static mt19937 gen(rdev()); - static std::uniform_int_distribution distr(0, std::numeric_limits::max()); +extern int randomSeed; +// random 32-bit integer number +inline int32_t RandomInt() { + /* + * Random number generation is thread safe when respective + * objects are not shared among threads. So, we will keep one + * random number generator per thread. + */ + // thread_local is static by default, but let's keep it static for clarity + static thread_local mt19937 gen(randomSeed); + static thread_local std::uniform_int_distribution distr(0, std::numeric_limits::max()); return distr(gen); } template +// random real number from 0 (inclusive) to 1 (exclusive) inline T RandomReal() { - // Static is thread-safe in C++ 11 - static random_device rdev; - static mt19937 gen(rdev()); - static std::uniform_real_distribution distr(0, 1); + /* + * Random number generation is thread safe when respective + * objects are not shared among threads. So, we will keep one + * random number generator per thread. + */ + // thread_local is static by default, but let's keep it static for clarity + static thread_local mt19937 gen(randomSeed); + static thread_local std::uniform_real_distribution distr(0, 1); return distr(gen); } diff --git a/similarity_search/src/init.cc b/similarity_search/src/init.cc index 08183f7..6acf3fe 100644 --- a/similarity_search/src/init.cc +++ b/similarity_search/src/init.cc @@ -32,7 +32,11 @@ namespace similarity { -void initLibrary(LogChoice choice, const char* pLogFile) { +int randomSeed = 0; + +void initLibrary(int seed, LogChoice choice, const char* pLogFile) { + randomSeed = seed; + std::ios_base::sync_with_stdio(false); InitializeLogger(choice, pLogFile); initSpaces(); diff --git a/similarity_search/src/method/hnsw.cc b/similarity_search/src/method/hnsw.cc index 589a940..ebdd417 100644 --- a/similarity_search/src/method/hnsw.cc +++ b/similarity_search/src/method/hnsw.cc @@ -158,8 +158,6 @@ namespace similarity { { AnyParamManager pmgr(IndexParams); - generator.reset(new std::default_random_engine(100)); - pmgr.GetParamOptional("M", M_, 16); // Let's use a generic algorithm by default! diff --git a/similarity_search/src/utils.cc b/similarity_search/src/utils.cc index 85e6a3f..2c03958 100644 --- a/similarity_search/src/utils.cc +++ b/similarity_search/src/utils.cc @@ -40,7 +40,6 @@ namespace similarity { - const char* GetFileName(const char* fullpath) { for (int i = strlen(fullpath) - 1; i >= 0; --i) { if (fullpath[i] == '\\' || fullpath[i] == '/') { diff --git a/similarity_search/test/bunit.cc b/similarity_search/test/bunit.cc index b66fe57..6957242 100644 --- a/similarity_search/test/bunit.cc +++ b/similarity_search/test/bunit.cc @@ -92,7 +92,7 @@ int TestRunner::RunAllTests() { int main(int argc, char *argv[]) { std::string LogFile; if (argc == 2) LogFile = argv[1]; - similarity::initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + similarity::initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); return similarity::TestRunner::Instance().RunAllTests(); } diff --git a/similarity_search/test/test_integr.cc b/similarity_search/test/test_integr.cc index 1d58262..1f6aeda 100644 --- a/similarity_search/test/test_integr.cc +++ b/similarity_search/test/test_integr.cc @@ -308,7 +308,7 @@ int main(int ac, char* av[]) { string LogFile; if (ac == 2) LogFile = av[1]; - initLibrary(LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); + initLibrary(0, LogFile.empty() ? LIB_LOGSTDERR:LIB_LOGFILE, LogFile.c_str()); WallClockTimer timer; timer.reset();