Skip to content

Commit

Permalink
Merge pull request #371 from gregfriedland/greg/bit_jaccard_pr
Browse files Browse the repository at this point in the history
bit_jaccard space
  • Loading branch information
Leonid Boytsov authored and GitHub committed May 25, 2019
2 parents a1edf49 + 4fbae06 commit 59f49ec
Show file tree
Hide file tree
Showing 15 changed files with 463 additions and 233 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ similarity_search/test/Makefile
*.so
*.pyc
*.egg-info/
.idea
83 changes: 83 additions & 0 deletions python_bindings/tests/bindings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,14 @@ def get_hitrate(ground_truth, ids):
return len(set(i for i, _ in ground_truth).intersection(ids))


def bit_vector_to_str(bit_vect):
    """Render a bit vector as space-separated "0"/"1" tokens, one per element.

    Each element is tested for truthiness; an empty input yields "".
    """
    tokens = []
    for bit in bit_vect:
        tokens.append("1" if bit else "0")
    return " ".join(tokens)


def bit_vector_sparse_str(bit_vect):
    """Render a bit vector as the space-separated 0-based positions of its truthy elements.

    A vector with no truthy element (or an empty one) yields "".
    """
    positions = []
    for pos, bit in enumerate(bit_vect):
        if bit:
            positions.append(str(pos))
    return " ".join(positions)


class DenseIndexTestMixin(object):
def _get_index(self, space='cosinesimil'):
raise NotImplementedError()
Expand Down Expand Up @@ -95,11 +103,86 @@ def testReloadIndex(self):
reloaded_results)


class BitVectorIndexTestMixin(object):
    """Shared tests for indices whose data points are bit vectors passed as strings.

    Concrete test cases override ``_get_index`` to supply the index under test.
    """

    def _get_index(self, space='bit_jaccard'):
        """Return a fresh, empty index for ``space``; must be overridden."""
        raise NotImplementedError()

    def _get_batches(self, index, nbits, num_elems, chunk_size):
        """Generate ``num_elems`` random bit vectors, serialized and grouped in batches.

        Side effect: stores the chosen serializer on ``self.bit_vector_str_func``
        so that the tests can serialize their query vector the same way.

        Returns a list of ``[ids, strs]`` pairs, where ``ids`` is a numpy range
        of consecutive ids and ``strs`` the matching serialized vectors. Every
        batch has ``chunk_size`` entries except possibly the last one.
        """
        # Dense "0 1 0 ..." text when str(index) contains "bit_", otherwise a
        # list of set positions.
        # NOTE(review): this relies on str(index) mentioning the space name - confirm.
        if "bit_" in str(index):
            self.bit_vector_str_func = bit_vector_to_str
        else:
            self.bit_vector_str_func = bit_vector_sparse_str

        batches = []
        for start in range(0, num_elems, chunk_size):
            # Clamp the final chunk so that exactly num_elems vectors are
            # produced even when num_elems is not a multiple of chunk_size.
            stop = min(start + chunk_size, num_elems)
            strs = []
            for _ in range(start, stop):
                a = np.random.rand(nbits) > 0.5
                strs.append(self.bit_vector_str_func(a))
            batches.append([np.arange(start, stop), strs])
        return batches

    def testKnnQuery(self):
        """Smoke test: build an index and run one k-NN query (result is not inspected)."""
        np.random.seed(23)

        index = self._get_index()
        batches = self._get_batches(index, 512, 2000, 1000)
        for ids, data in batches:
            index.addDataPointBatch(ids=ids, data=data)

        index.createIndex()

        s = self.bit_vector_str_func(np.ones(512))
        index.knnQuery(s, k=10)

    def testReloadIndex(self):
        """A saved and reloaded index must answer a query like the original does."""
        # Local imports: only this test needs them.
        import os
        import shutil

        np.random.seed(23)

        original = self._get_index()
        batches = self._get_batches(original, 512, 2000, 1000)
        for ids, data in batches:
            original.addDataPointBatch(ids=ids, data=data)
        original.createIndex()

        # test out saving/reloading index
        # The index is written into a private temporary directory that is
        # removed afterwards, so nothing written by saveIndex is left behind.
        tmp_dir = tempfile.mkdtemp()
        try:
            index_path = os.path.join(tmp_dir, "bit_vector.index")
            original.saveIndex(index_path)

            reloaded = self._get_index()
            for ids, data in batches:
                reloaded.addDataPointBatch(ids=ids, data=data)
            reloaded.loadIndex(index_path)

            s = self.bit_vector_str_func(np.ones(512))
            original_results = original.knnQuery(s)
            reloaded_results = reloaded.knnQuery(s)
            npt.assert_allclose(original_results,
                                reloaded_results)
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)


class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin):
    """Runs the DenseIndexTestMixin tests against an index built with method 'hnsw'."""

    def _get_index(self, space='cosinesimil'):
        index = nmslib.init(method='hnsw', space=space)
        return index


class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin):
    """Runs the bit-vector tests with method 'hnsw' over the 'bit_jaccard' space."""

    def _get_index(self, space='bit_jaccard'):
        # Data points are passed as strings; distances are floats.
        init_kwargs = dict(
            method='hnsw',
            space=space,
            data_type=nmslib.DataType.OBJECT_AS_STRING,
            dtype=nmslib.DistType.FLOAT,
        )
        return nmslib.init(**init_kwargs)


class SparseJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin):
    """Runs the bit-vector tests with method 'hnsw' over the 'jaccard_sparse' space."""

    def _get_index(self, space='jaccard_sparse'):
        # Data points are passed as strings; distances are floats.
        init_kwargs = dict(
            method='hnsw',
            space=space,
            data_type=nmslib.DataType.OBJECT_AS_STRING,
            dtype=nmslib.DistType.FLOAT,
        )
        return nmslib.init(**init_kwargs)


class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin):
    """Runs the bit-vector tests with method 'hnsw' over the 'bit_hamming' space."""

    def _get_index(self, space='bit_hamming'):
        # Data points are passed as strings; distances are integers.
        init_kwargs = dict(
            method='hnsw',
            space=space,
            data_type=nmslib.DataType.OBJECT_AS_STRING,
            dtype=nmslib.DistType.INT,
        )
        return nmslib.init(**init_kwargs)


class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin):
def _get_index(self, space='cosinesimil'):
return nmslib.init(method='sw-graph', space=space)
Expand Down
13 changes: 13 additions & 0 deletions similarity_search/include/distcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);

/**
 * Jaccard distance between two bit vectors packed into unsigned integer words.
 *
 * The result is 1 - |a AND b| / |a OR b|, where |.| counts set bits over the
 * first qty words of each array.
 *
 * @param a    first packed bit vector (at least qty words)
 * @param b    second packed bit vector (at least qty words)
 * @param qty  number of words to compare
 * @return     the distance; 0 when neither vector has a set bit (two empty
 *             sets are treated as identical instead of evaluating 0/0)
 */
template <typename dist_t, typename dist_uint_t>
dist_t inline BitJaccard(const dist_uint_t* a, const dist_uint_t* b, size_t qty) {
  // Counters are size_t rather than the word type, so that narrow word types
  // cannot overflow on long vectors.
  size_t num = 0, den = 0;

  for (size_t i = 0; i < qty; ++i) {
    // __builtin_popcountll quickly computes the number of 1s. The 64-bit
    // variant is used because __builtin_popcount takes an unsigned int and
    // would silently drop the upper half of a 64-bit word.
    num += __builtin_popcountll(static_cast<unsigned long long>(a[i] & b[i]));
    den += __builtin_popcountll(static_cast<unsigned long long>(a[i] | b[i]));
  }

  // Both vectors are empty: avoid 0/0 (NaN for floating-point dist_t).
  if (den == 0) {
    return dist_t(0);
  }

  return 1 - (dist_t(num) / dist_t(den));
}

//unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty);

unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) {
Expand Down
9 changes: 6 additions & 3 deletions similarity_search/include/factory/init_spaces.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "factory/space/space_edist.h"
#include "factory/space/space_bit_hamming.h"
#include "factory/space/space_bit_jaccard.h"
#include "factory/space/space_bregman.h"
#include "factory/space/space_dummy.h"
#include "factory/space/space_js.h"
Expand All @@ -36,15 +37,17 @@

namespace similarity {


inline void initSpaces() {
// Registering a dummy space
REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy)

// Registering binary/bit Hamming
REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming)
// Registering binary/bit Hamming/Jaccard
SpaceFactoryRegistry<int>::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming<int,uint32_t>;
REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr )
SpaceFactoryRegistry<float>::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard<float,uint32_t>;
REGISTER_SPACE_CREATOR(float, SPACE_BIT_JACCARD, bit_jaccard_func_ptr )

// Registering the Levenshtein distance: regular and normalized
REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein)
Expand Down
5 changes: 3 additions & 2 deletions similarity_search/include/factory/space/space_bit_hamming.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ namespace similarity {
* Creating functions.
*/

inline Space<int>* CreateBitHamming(const AnyParams& /* ignoring params */) {
return new SpaceBitHamming();
/*
 * Allocates a new SpaceBitHamming<dist_t,dist_uint_t> and returns it as a
 * pointer to its Space<dist_t> base. The parameters are ignored.
 * NOTE(review): the object is created with new; presumably the caller takes
 * ownership - confirm against the space factory.
 */
template <typename dist_t, typename dist_uint_t>
inline Space<dist_t>* CreateBitHamming(const AnyParams& /* ignoring params */) {
  typedef SpaceBitHamming<dist_t,dist_uint_t> SpaceType;
  Space<dist_t>* space = new SpaceType();
  return space;
}

/*
Expand Down
39 changes: 39 additions & 0 deletions similarity_search/include/factory/space/space_bit_jaccard.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Non-metric Space Library
*
* Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
*
* For the complete list of contributors and further details see:
* https://github.com/searchivarius/NonMetricSpaceLib
*
* Copyright (c) 2013-2018
*
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
*/
#ifndef FACTORY_SPACE_BIT_JACCARD_H
#define FACTORY_SPACE_BIT_JACCARD_H

#include <space/space_bit_jaccard.h>

namespace similarity {

/*
* Creating functions.
*/

/*
 * Allocates a new SpaceBitJaccard<dist_t,dist_uint_t> and returns it as a
 * pointer to its Space<dist_t> base. The parameters are ignored.
 * NOTE(review): the object is created with new; presumably the caller takes
 * ownership - confirm against the space factory.
 */
template <typename dist_t, typename dist_uint_t>
inline Space<dist_t>* CreateBitJaccard(const AnyParams& /* ignoring params */) {
  typedef SpaceBitJaccard<dist_t,dist_uint_t> SpaceType;
  Space<dist_t>* space = new SpaceType();
  return space;
}

/*
* End of creating functions.
*/
}

#endif



2 changes: 1 addition & 1 deletion similarity_search/include/method/perm_bin_vptree.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class PermBinVPTree : public Index<dist_t> {
ObjectVector BinPermData_;

unique_ptr<VPTree<int, PolynomialPruner<int>>> VPTreeIndex_;
unique_ptr<SpaceBitHamming> VPTreeSpace_;
unique_ptr<SpaceBitHamming<int,uint32_t>> VPTreeSpace_;

// disable copy and assign
DISABLE_COPY_AND_ASSIGN(PermBinVPTree);
Expand Down
16 changes: 15 additions & 1 deletion similarity_search/include/permutation_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#include "rangequery.h"
#include "knnquery.h"
#include "permutation_type.h"
#include "distcomp.h"
#include "utils.h"

namespace similarity {
Expand Down Expand Up @@ -163,6 +162,21 @@ inline void Binarize(const vector<PivotIdType> &perm, const PivotIdType thresh,
}
}

/*
 * Thresholds a permutation and packs the result into 64-bit words.
 *
 * Bit (i % 64) of word (i / 64) is set if and only if perm[i] >= thresh.
 * bin_perm is resized to ceil(perm.size() / 64) words and fully overwritten.
 */
inline void Binarize(const vector<PivotIdType> &perm, const PivotIdType thresh, vector<uint64_t>&bin_perm) {
  const size_t bin_perm_word_qty = (perm.size() + 63)/64;

  // assign() sets the size and zeroes every word in one step.
  bin_perm.assign(bin_perm_word_qty, 0);

  for (size_t i = 0; i < perm.size(); ++i) {
    if (perm[i] >= thresh) {
      // The shifted one must be 64 bits wide. A plain 1 is an int: shifting it
      // by 32..63 is undefined behavior, and 1<<31 sign-extends into the upper
      // 32 bits when widened to uint64_t.
      bin_perm[i/64] |= (uint64_t(1) << (i%64));
    }
  }
}

} // namespace similarity

#endif // _PERMUTATION_UTILS_H_
Expand Down
52 changes: 14 additions & 38 deletions similarity_search/include/space/space_bit_hamming.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,55 +25,31 @@
#include "utils.h"
#include "space.h"
#include "distcomp.h"
#include "space_bit_vector.h"

#define SPACE_BIT_HAMMING "bit_hamming"

namespace similarity {

class SpaceBitHamming : public Space<int> {
template <typename dist_t, typename dist_uint_t>
class SpaceBitHamming : public SpaceBitVector<dist_t,dist_uint_t> {
public:
explicit SpaceBitHamming() {}
virtual ~SpaceBitHamming() {}

/** Standard functions to read/write/create objects */
// Create an object from string representation.
virtual unique_ptr<Object> CreateObjFromStr(IdType id, LabelType label, const string& s,
DataFileInputState* pInpState) const;
// Create a string representation of an object.
virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const;
// Open a file for reading, fetch a header (if there is any) and memorize an input state
virtual unique_ptr<DataFileInputState> OpenReadFileHeader(const string& inputFile) const;
// Open a file for writing, write a header (if there is any) and memorize an output state
virtual unique_ptr<DataFileOutputState> OpenWriteFileHeader(const ObjectVector& dataset,
const string& outputFile) const;
/*
* Read a string representation of the next object in a file as well
* as its label. Return false, on EOF.
*/
virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const;
/** End of standard functions to read/write/create objects */

/*
* Used only for testing/debugging: compares objects approximately. Floating point numbers
* should be nearly equal. Integers and strings should coincide exactly.
*/
virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const;

virtual std::string StrDesc() const { return "Hamming (bit-storage) space"; }
virtual void CreateDenseVectFromObj(const Object* obj, int* pVect,
size_t nElem) const {
throw runtime_error("Cannot create a dense vector for the space: " + StrDesc());
}
virtual size_t GetElemQty(const Object* object) const {return 0;}
virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector<uint32_t>& InpVect) const {
InpVect.push_back(InpVect.size());
return CreateObjFromVectInternal(id, label, InpVect);
}

protected:
virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector<uint32_t>& InpVect) const;
Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector<uint32_t>& bitMaskVect) const;
virtual int HiddenDistance(const Object* obj1, const Object* obj2) const;
void ReadBitMaskVect(std::string line, LabelType& label, std::vector<uint32_t>& v) const;
/*
 * Distance between two stored bit vectors: both payloads are viewed as arrays
 * of dist_uint_t and handed to BitHamming. Both objects must be non-empty and
 * have the same data length.
 */
virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const {
  CHECK(obj1->datalength() > 0);
  CHECK(obj1->datalength() == obj2->datalength());

  // the last integer is an original number of elements, so it is excluded
  // from the comparison
  const size_t stored_word_qty = obj1->datalength() / sizeof(dist_uint_t);
  const size_t payload_word_qty = stored_word_qty - 1;

  const dist_uint_t* lhs = reinterpret_cast<const dist_uint_t*>(obj1->data());
  const dist_uint_t* rhs = reinterpret_cast<const dist_uint_t*>(obj2->data());

  return BitHamming(lhs, rhs, payload_word_qty);
}

DISABLE_COPY_AND_ASSIGN(SpaceBitHamming);
};
Expand Down
59 changes: 59 additions & 0 deletions similarity_search/include/space/space_bit_jaccard.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* Non-metric Space Library
*
* Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
*
* For the complete list of contributors and further details see:
* https://github.com/searchivarius/NonMetricSpaceLib
*
* Copyright (c) 2013-2018
*
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
*/
#ifndef _SPACE_BIT_JACCARD_H_
#define _SPACE_BIT_JACCARD_H_

#include <string>
#include <map>
#include <stdexcept>

#include <string.h>
#include "global.h"
#include "object.h"
#include "utils.h"
#include "space.h"
#include "distcomp.h"
#include "space_bit_vector.h"

#define SPACE_BIT_JACCARD "bit_jaccard"

namespace similarity {

/*
 * Bit-vector space (built on SpaceBitVector) whose distance is computed by
 * BitJaccard over the packed words of the two objects.
 */
template <typename dist_t, typename dist_uint_t>
class SpaceBitJaccard : public SpaceBitVector<dist_t,dist_uint_t> {
 public:
  explicit SpaceBitJaccard() {}
  virtual ~SpaceBitJaccard() {}

  // Human-readable description of this space.
  virtual std::string StrDesc() const {
    return "Jaccard (bit-storage) space";
  }

 protected:
  /*
   * Distance between two stored bit vectors. Both objects must be non-empty
   * and have the same data length.
   */
  virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const {
    CHECK(obj1->datalength() > 0);
    CHECK(obj1->datalength() == obj2->datalength());

    // the last integer is an original number of elements, so it is excluded
    // from the comparison
    const size_t stored_word_qty = obj1->datalength() / sizeof(dist_uint_t);
    const size_t payload_word_qty = stored_word_qty - 1;

    const dist_uint_t* lhs = reinterpret_cast<const dist_uint_t*>(obj1->data());
    const dist_uint_t* rhs = reinterpret_cast<const dist_uint_t*>(obj2->data());

    return BitJaccard<dist_t,dist_uint_t>(lhs, rhs, payload_word_qty);
  }

  DISABLE_COPY_AND_ASSIGN(SpaceBitJaccard);
};

} // namespace similarity

#endif
Loading

0 comments on commit 59f49ec

Please sign in to comment.