Skip to content

Commit

Permalink
added bit_jaccard and ABC bit_vector
Browse files Browse the repository at this point in the history
  • Loading branch information
Greg Friedland committed Feb 17, 2019
1 parent 3068df8 commit e984532
Show file tree
Hide file tree
Showing 18 changed files with 771 additions and 239 deletions.
10 changes: 10 additions & 0 deletions .idea/codeStyles/Project.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions .idea/codeStyles/codeStyleConfig.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

16 changes: 8 additions & 8 deletions python_bindings/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@
if os.path.exists(library_file):
# if we have a prebuilt nmslib library file, use that.
extra_objects.append(library_file)

else:
# Otherwise build all the files here directly (excluding extras which need eigen/boost)
exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc
dummy_app.cc main.cc""".split())

for root, subdirs, files in os.walk(os.path.join(libdir, "src")):
source_files.extend(os.path.join(root, f) for f in files
if f.endswith(".cc") and f not in exclude_files)
raise RuntimeError("can't find prebuild lib: " + os.path.abspath(library_file))
# # Otherwise build all the files here directly (excluding extras which need eigen/boost)
# exclude_files = set("""bbtree.cc lsh.cc lsh_multiprobe.cc lsh_space.cc falconn.cc nndes.cc space_sqfd.cc
# dummy_app.cc main.cc""".split())
#
# for root, subdirs, files in os.walk(os.path.join(libdir, "src")):
# source_files.extend(os.path.join(root, f) for f in files
# if f.endswith(".cc") and f not in exclude_files)


if sys.platform.startswith('linux'):
Expand Down
88 changes: 88 additions & 0 deletions python_bindings/tests/bindings_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,99 @@ def testReloadIndex(self):
reloaded_results)


class BitVectorIndexTestMixin(object):
    """Shared test logic for indexes whose points are bit vectors.

    Points and queries are passed to the index as strings of space-separated
    "0"/"1" tokens. Concrete test cases provide the index via ``_get_index``.
    """

    def _get_index(self, space='bit_jaccard'):
        raise NotImplementedError()

    def testKnnQuery(self):
        """Index 100 random 128-bit vectors, then query with the all-ones vector.

        The result of the query is only printed; nothing is asserted yet.
        """

        def as_bit_string(bits):
            # One "1"/"0" token per element, chosen by the element's truthiness.
            return " ".join("1" if bit else "0" for bit in bits)

        np.random.seed(23)
        nbits = 128

        index = self._get_index()

        for point_id in range(100):
            random_bits = np.random.rand(nbits) > 0.5
            index.addDataPoint(id=point_id, data=as_bit_string(random_bits))
        index.createIndex()

        query = as_bit_string(np.ones(nbits))
        ids, distances = index.knnQuery(query, k=10)
        print(ids)
        print(distances)
        # self.assertTrue(get_hitrate(get_exact_cosine(row, data), ids) >= 5)

# def testKnnQueryBatch(self):
# np.random.seed(23)
# data = np.random.randn(1000, 10).astype(np.float32)
#
# index = self._get_index()
# index.addDataPointBatch(data)
# index.createIndex()
#
# queries = data[:10]
# results = index.knnQueryBatch(queries, k=10)
# for query, (ids, distances) in zip(queries, results):
# self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)
#
# # test col-major arrays
# queries = np.asfortranarray(queries)
# results = index.knnQueryBatch(queries, k=10)
# for query, (ids, distances) in zip(queries, results):
# self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)
#
# # test custom ids (set id to square of each row)
# index = self._get_index()
# index.addDataPointBatch(data, ids=np.arange(data.shape[0]) ** 2)
# index.createIndex()
#
# queries = data[:10]
# results = index.knnQueryBatch(queries, k=10)
# for query, (ids, distances) in zip(queries, results):
# # convert from square back to row id
# ids = np.sqrt(ids).astype(int)
# self.assertTrue(get_hitrate(get_exact_cosine(query, data), ids) >= 5)

# def testReloadIndex(self):
# np.random.seed(23)
# data = np.random.randn(1000, 10).astype(np.float32)
#
# original = self._get_index()
# original.addDataPointBatch(data)
# original.createIndex()
#
# # test out saving/reloading index
# with tempfile.NamedTemporaryFile() as tmp:
# original.saveIndex(tmp.name + ".index")
#
# reloaded = self._get_index()
# reloaded.addDataPointBatch(data)
# reloaded.loadIndex(tmp.name + ".index")
#
# original_results = original.knnQuery(data[0])
# reloaded_results = reloaded.knnQuery(data[0])
# npt.assert_allclose(original_results,
# reloaded_results)


class HNSWTestCase(unittest.TestCase, DenseIndexTestMixin):
    """Runs the dense-vector index tests against the 'hnsw' method."""

    def _get_index(self, space='cosinesimil'):
        # Build a fresh, empty index for the requested space.
        index = nmslib.init(method='hnsw', space=space)
        return index


class BitJaccardTestCase(unittest.TestCase, BitVectorIndexTestMixin):
    """Runs the bit-vector index tests against the 'hnsw' method."""

    def _get_index(self, space='bit_jaccard'):
        # Forward the requested space instead of hard-coding it, so the
        # parameter is honored the same way HNSWTestCase honors it. The
        # default keeps the previous behavior for callers passing nothing.
        #
        # NOTE(review): the C++ factory appears to register bit_jaccard for
        # double distances; confirm DistType.INT resolves to a registered space.
        return nmslib.init(method='hnsw', space=space, data_type=nmslib.DataType.OBJECT_AS_STRING,
                           dtype=nmslib.DistType.INT)


# class BitHammingTestCase(unittest.TestCase, BitVectorIndexTestMixin):
# def _get_index(self, space='bit_hamming'):
# return nmslib.init(method='hnsw', space='bit_hamming', data_type=nmslib.DataType.OBJECT_AS_STRING,
# dtype=nmslib.DistType.INT)


class SWGraphTestCase(unittest.TestCase, DenseIndexTestMixin):
def _get_index(self, space='cosinesimil'):
return nmslib.init(method='sw-graph', space=space)
Expand Down
13 changes: 13 additions & 0 deletions similarity_search/include/distcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,19 @@ int SpearmanRho(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanFootruleSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);
int SpearmanRhoSIMD(const PivotIdType* x, const PivotIdType* y, size_t qty);

//template <typename dist_t, typename dist_uint_t>
/**
 * Jaccard ratio |a AND b| / |a OR b| of two bit vectors packed into 64-bit words.
 *
 * @param a    first bit vector, qty words long
 * @param b    second bit vector, qty words long
 * @param qty  number of 64-bit words in each vector
 * @return     the ratio of common set bits to bits set in either vector;
 *             1.0 when neither vector has a set bit (or qty is 0), because
 *             the union is empty and 0/0 would otherwise produce NaN.
 *
 * NOTE(review): the value grows with similarity (1.0 for identical vectors);
 * confirm that callers using it as a distance expect this orientation.
 */
double inline BitJaccard(const uint64_t* a, const uint64_t* b, size_t qty) {
  uint64_t num = 0, den = 0;

  for (size_t i = 0; i < qty; ++i) {
    // The "ll" builtin counts ones across all 64 bits. Plain
    // __builtin_popcount takes an unsigned int, so it would silently
    // ignore the upper 32 bits of every word.
    num += __builtin_popcountll(a[i] & b[i]);
    den += __builtin_popcountll(a[i] | b[i]);
  }

  // Two all-zero vectors are identical: avoid dividing zero by zero.
  if (den == 0) {
    return 1.0;
  }

  return double(num) / double(den);
}

//unsigned BitHamming(const uint32_t* a, const uint32_t* b, size_t qty);

unsigned inline BitHamming(const uint32_t* a, const uint32_t* b, size_t qty) {
Expand Down
9 changes: 6 additions & 3 deletions similarity_search/include/factory/init_spaces.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "factory/space/space_edist.h"
#include "factory/space/space_bit_hamming.h"
#include "factory/space/space_bit_jaccard.h"
#include "factory/space/space_bregman.h"
#include "factory/space/space_dummy.h"
#include "factory/space/space_js.h"
Expand All @@ -36,15 +37,17 @@

namespace similarity {


inline void initSpaces() {
// Registering a dummy space
REGISTER_SPACE_CREATOR(int, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(float, SPACE_DUMMY, CreateDummy)
REGISTER_SPACE_CREATOR(double, SPACE_DUMMY, CreateDummy)

// Registering binary/bit Hamming
REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, CreateBitHamming)
// Registering binary/bit Hamming/Jaccard
SpaceFactoryRegistry<int>::CreateFuncPtr bit_hamming_func_ptr = CreateBitHamming<int,uint32_t>;
REGISTER_SPACE_CREATOR(int, SPACE_BIT_HAMMING, bit_hamming_func_ptr )
SpaceFactoryRegistry<double>::CreateFuncPtr bit_jaccard_func_ptr = CreateBitJaccard<double,uint64_t>;
REGISTER_SPACE_CREATOR(double, SPACE_BIT_JACCARD, bit_jaccard_func_ptr )

// Registering the Levenshtein distance: regular and normalized
REGISTER_SPACE_CREATOR(int, SPACE_LEVENSHTEIN, CreateLevenshtein)
Expand Down
5 changes: 3 additions & 2 deletions similarity_search/include/factory/space/space_bit_hamming.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ namespace similarity {
* Creating functions.
*/

inline Space<int>* CreateBitHamming(const AnyParams& /* ignoring params */) {
return new SpaceBitHamming();
/*
 * Factory for the bit-Hamming space: allocates a SpaceBitHamming with the
 * given distance/word types and returns it through the Space<dist_t> base
 * pointer. The parameters argument is not used.
 */
template <typename dist_t, typename dist_uint_t>
inline Space<dist_t>* CreateBitHamming(const AnyParams& /* ignoring params */) {
  Space<dist_t>* space = new SpaceBitHamming<dist_t,dist_uint_t>();
  return space;
}

/*
Expand Down
39 changes: 39 additions & 0 deletions similarity_search/include/factory/space/space_bit_jaccard.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/**
* Non-metric Space Library
*
* Main developers: Bilegsaikhan Naidan, Leonid Boytsov, Yury Malkov, Ben Frederickson, David Novak
*
* For the complete list of contributors and further details see:
* https://github.com/searchivarius/NonMetricSpaceLib
*
* Copyright (c) 2013-2018
*
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
*/
#ifndef FACTORY_SPACE_BIT_JACCARD_H
#define FACTORY_SPACE_BIT_JACCARD_H

#include <space/space_bit_jaccard.h>

namespace similarity {

/*
* Creating functions.
*/

/*
 * Factory for the bit-Jaccard space: allocates a SpaceBitJaccard with the
 * given distance/word types and returns it through the Space<dist_t> base
 * pointer. The parameters argument is not used.
 */
template <typename dist_t, typename dist_uint_t>
inline Space<dist_t>* CreateBitJaccard(const AnyParams& /* ignoring params */) {
  Space<dist_t>* space = new SpaceBitJaccard<dist_t,dist_uint_t>();
  return space;
}

/*
* End of creating functions.
*/
}

#endif



2 changes: 1 addition & 1 deletion similarity_search/include/method/perm_bin_vptree.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ class PermBinVPTree : public Index<dist_t> {
ObjectVector BinPermData_;

unique_ptr<VPTree<int, PolynomialPruner<int>>> VPTreeIndex_;
unique_ptr<SpaceBitHamming> VPTreeSpace_;
unique_ptr<SpaceBitHamming<int,uint32_t>> VPTreeSpace_;

// disable copy and assign
DISABLE_COPY_AND_ASSIGN(PermBinVPTree);
Expand Down
16 changes: 15 additions & 1 deletion similarity_search/include/permutation_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
#include "rangequery.h"
#include "knnquery.h"
#include "permutation_type.h"
#include "distcomp.h"
#include "utils.h"

namespace similarity {
Expand Down Expand Up @@ -163,6 +162,21 @@ inline void Binarize(const vector<PivotIdType> &perm, const PivotIdType thresh,
}
}

/*
 * Packs a thresholded permutation into 64-bit words.
 *
 * Bit i of the output (bit i%64 of word i/64) is set exactly when
 * perm[i] >= thresh; all other bits are zero.
 *
 * perm      input permutation values
 * thresh    values greater than or equal to this produce a one bit
 * bin_perm  output; resized to ceil(perm.size()/64) words and overwritten
 */
inline void Binarize(const vector<PivotIdType> &perm, const PivotIdType thresh, vector<uint64_t>&bin_perm) {
  const size_t bin_perm_word_qty = (perm.size() + 63) / 64;

  // Size the output and zero every word in one step.
  bin_perm.assign(bin_perm_word_qty, 0);

  for (size_t i = 0; i < perm.size(); ++i) {
    if (perm[i] >= thresh) {
      // The shifted one must be 64 bits wide: shifting a plain int literal
      // by 32..63 positions is undefined and loses the upper half of a word.
      bin_perm[i / 64] |= uint64_t(1) << (i % 64);
    }
  }
}

} // namespace similarity

#endif // _PERMUTATION_UTILS_H_
Expand Down
52 changes: 14 additions & 38 deletions similarity_search/include/space/space_bit_hamming.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,55 +25,31 @@
#include "utils.h"
#include "space.h"
#include "distcomp.h"
#include "space_bit_vector.h"

#define SPACE_BIT_HAMMING "bit_hamming"

namespace similarity {

class SpaceBitHamming : public Space<int> {
template <typename dist_t, typename dist_uint_t>
class SpaceBitHamming : public SpaceBitVector<dist_t,dist_uint_t> {
public:
explicit SpaceBitHamming() {}
virtual ~SpaceBitHamming() {}

/** Standard functions to read/write/create objects */
// Create an object from string representation.
virtual unique_ptr<Object> CreateObjFromStr(IdType id, LabelType label, const string& s,
DataFileInputState* pInpState) const;
// Create a string representation of an object.
virtual string CreateStrFromObj(const Object* pObj, const string& externId /* ignored */) const;
// Open a file for reading, fetch a header (if there is any) and memorize an input state
virtual unique_ptr<DataFileInputState> OpenReadFileHeader(const string& inputFile) const;
// Open a file for writing, write a header (if there is any) and memorize an output state
virtual unique_ptr<DataFileOutputState> OpenWriteFileHeader(const ObjectVector& dataset,
const string& outputFile) const;
/*
* Read a string representation of the next object in a file as well
* as its label. Return false, on EOF.
*/
virtual bool ReadNextObjStr(DataFileInputState &, string& strObj, LabelType& label, string& externId) const;
/** End of standard functions to read/write/create objects */

/*
* Used only for testing/debugging: compares objects approximately. Floating point numbers
* should be nearly equal. Integers and strings should coincide exactly.
*/
virtual bool ApproxEqual(const Object& obj1, const Object& obj2) const;

virtual std::string StrDesc() const { return "Hamming (bit-storage) space"; }
virtual void CreateDenseVectFromObj(const Object* obj, int* pVect,
size_t nElem) const {
throw runtime_error("Cannot create a dense vector for the space: " + StrDesc());
}
virtual size_t GetElemQty(const Object* object) const {return 0;}
virtual Object* CreateObjFromVect(IdType id, LabelType label, std::vector<uint32_t>& InpVect) const {
InpVect.push_back(InpVect.size());
return CreateObjFromVectInternal(id, label, InpVect);
}

protected:
virtual Object* CreateObjFromVectInternal(IdType id, LabelType label, const std::vector<uint32_t>& InpVect) const;
Object* CreateObjFromBitMaskVect(IdType id, LabelType label, const std::vector<uint32_t>& bitMaskVect) const;
virtual int HiddenDistance(const Object* obj1, const Object* obj2) const;
void ReadBitMaskVect(std::string line, LabelType& label, std::vector<uint32_t>& v) const;
/*
 * Distance between two stored bit vectors: both payloads must be non-empty
 * and of equal byte length; the result is BitHamming over their words.
 */
virtual dist_t HiddenDistance(const Object* obj1, const Object* obj2) const {
  CHECK(obj1->datalength() > 0);
  CHECK(obj1->datalength() == obj2->datalength());

  // The last integer is an original number of elements, so it is left out
  // of the comparison.
  const size_t word_qty = obj1->datalength() / sizeof(dist_uint_t) - 1;

  const auto* lhs = reinterpret_cast<const dist_uint_t*>(obj1->data());
  const auto* rhs = reinterpret_cast<const dist_uint_t*>(obj2->data());

  return BitHamming(lhs, rhs, word_qty);
}

DISABLE_COPY_AND_ASSIGN(SpaceBitHamming);
};
Expand Down
Loading

0 comments on commit e984532

Please sign in to comment.