From 7186a8c93c4e3d6e97b1029b68581f77ac3b2ee5 Mon Sep 17 00:00:00 2001 From: searchivarius Date: Wed, 19 Jun 2019 12:57:39 -0400 Subject: [PATCH] Additional mem leak fixes: #398 --- python_bindings/tests/bindings_test.py | 113 +++++++++++++++++++++++++ similarity_search/include/object.h | 8 +- similarity_search/src/space.cc | 4 +- 3 files changed, 123 insertions(+), 2 deletions(-) diff --git a/python_bindings/tests/bindings_test.py b/python_bindings/tests/bindings_test.py index b84dc53..068b52f 100644 --- a/python_bindings/tests/bindings_test.py +++ b/python_bindings/tests/bindings_test.py @@ -6,6 +6,12 @@ import numpy.testing as npt import nmslib +import time + +import os, gc, psutil + +MEM_TEST_ITER=50 +MEM_GROW_COEFF=8 # This is a bit adhoc but seems to work in practice def get_exact_cosine(row, data, N=10): @@ -294,6 +300,113 @@ def testGlobal(self): # this is a one line reproduction of https://github.com/nmslib/nmslib/issues/327 GlobalTestCase.index = nmslib.init() +class MemoryLeak1TestCase(TestCaseBase): + def testMemoryLeak1(self): + process = psutil.Process(os.getpid()) + + np.random.seed(23) + data = np.random.randn(10000, 10).astype(np.float32) + query = np.random.randn(1000, 10).astype(np.float32) + space_name = 'l2' + + num_threads=4 + + index_time_params = {'M': 20, + 'efConstruction': 100, + 'indexThreadQty': num_threads, + 'post' : 0, + 'skip_optimized_index' : 1} # using non-optimized index! + + query_time_params = {'efSearch': 100} + + with tempfile.NamedTemporaryFile() as tmp: + + index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) + index.addDataPointBatch(data) + + index.createIndex(index_time_params) + index.saveIndex(tmp.name, save_data=True) + + init_mem = process.memory_info().rss + + delta1 = None + + for k in range(MEM_TEST_ITER): + + index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) + index.loadIndex(tmp.name, load_data=True) + index.setQueryTimeParams(query_time_params) + + if k == 0: + delta1 = process.memory_info().rss - init_mem + + nbrs = index.knnQueryBatch(query, k = 10, num_threads = num_threads) + + nbrs = None + index = None + + gc.collect() + + gc.collect() + time.sleep(1) + gc.collect() + time.sleep(1) + delta_last = process.memory_info().rss - init_mem + + # if this check fails a memory leak is possible (but not necessarily 100% certain, memory is random) + self.assertTrue(delta_last < delta1 * MEM_GROW_COEFF) + +class MemoryLeak2TestCase(TestCaseBase): + def testMemoryLeak2(self): + process = psutil.Process(os.getpid()) + + np.random.seed(23) + data = np.random.randn(10000, 10).astype(np.float32) + query = np.random.randn(1000, 10).astype(np.float32) + space_name = 'l2' + + num_threads=4 + + index_time_params = {'M': 20, + 'efConstruction': 100, + 'indexThreadQty': num_threads, + 'post' : 0, + 'skip_optimized_index' : 1} # using non-optimized index! + + query_time_params = {'efSearch': 100} + + with tempfile.NamedTemporaryFile() as tmp: + + init_mem = process.memory_info().rss + + delta1 = None + + for k in range(MEM_TEST_ITER): + + index = nmslib.init(method='hnsw', space=space_name, data_type=nmslib.DataType.DENSE_VECTOR) + index.addDataPointBatch(data) + index.createIndex(index_time_params) + + if k == 0: + delta1 = process.memory_info().rss - init_mem + + index.setQueryTimeParams(query_time_params) + nbrs = index.knnQueryBatch(query, k = 10, num_threads = num_threads) + + nbrs = None + index = None + + gc.collect() + + gc.collect() + time.sleep(1) + gc.collect() + time.sleep(1) + delta_last = process.memory_info().rss - init_mem + + # if this check fails a memory leak is possible (but not necessarily 100% certain, memory is random) + self.assertTrue(delta_last < delta1 * MEM_GROW_COEFF) + if __name__ == "__main__": unittest.main() diff --git a/similarity_search/include/object.h b/similarity_search/include/object.h index 7bddc29..1dbb749 100644 --- a/similarity_search/include/object.h +++ b/similarity_search/include/object.h @@ -49,7 +49,13 @@ using std::numeric_limits; class Object { public: - explicit Object(char* buffer) : buffer_(buffer), memory_allocated_(false) {} + /* + * Memory ownership handling of the Object class is not good, but we keep it this way for historical/compatibility reasons + * Currrently, when the second constructor is called, new memory is always allocated. + * However, this constructor is nornally called to created an object that reuses someone else's memory. + * In NMSLIB 1.8.1 we make it possible to take ownership of the memory provided. + */ + explicit Object(char* buffer, bool memory_allocated=false) : buffer_(buffer), memory_allocated_(memory_allocated) {} Object(IdType id, LabelType label, size_t datalength, const void* data) { buffer_ = new char[ID_SIZE + LABEL_SIZE + DATALENGTH_SIZE + datalength]; diff --git a/similarity_search/src/space.cc b/similarity_search/src/space.cc index eda43cf..8449e48 100644 --- a/similarity_search/src/space.cc +++ b/similarity_search/src/space.cc @@ -77,7 +77,9 @@ Space::ReadObjectVectorFromBinData(ObjectVector& data, readBinaryPOD(input, objSize); unique_ptr buf(new char[objSize]); input.read(&buf[0], objSize); - data.push_back(new Object(buf.release())); + // true guarantees that the Object will take ownership of memory + // less than ideal, but ok for now + data.push_back(new Object(buf.release(), true)); } return unique_ptr(new DataFileInputState());