diff --git a/similarity_search/include/read_data.h b/similarity_search/include/read_data.h
new file mode 100644
index 0000000..c360690
--- /dev/null
+++ b/similarity_search/include/read_data.h
@@ -0,0 +1,299 @@
+#ifndef READ_DATA_H
+#define READ_DATA_H
+
+#include <cerrno>
+#include <climits>
+#include <cstdint>
+#include <cstdlib>
+#include <exception>
+#include <ostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// NOTE(review): ReplaceSomePunct, LOG/LIB_ERROR and IdType are used below but
+// are not declared in this header; presumably the including file provides
+// them through project headers included earlier -- confirm.
+
+namespace similarity {
+
+using std::string;
+using std::vector;
+using std::stringstream;
+
+/*
+ * One (id, value) entry of a sparse vector.
+ * operator< orders by id only; operator== compares both id and value.
+ */
+template <typename dist_t>
+struct SparseVectElem {
+  uint32_t id_;
+  dist_t val_;
+  SparseVectElem(uint32_t id = 0, dist_t val = 0) : id_(id), val_(val) {}
+  bool operator<(const SparseVectElem& that) const {
+    return id_ < that.id_;
+  }
+  bool operator==(const SparseVectElem& that) const {
+    return id_ == that.id_ && val_ == that.val_;
+  }
+  bool operator!=(const SparseVectElem& that) const {
+    return !operator==(that);
+  }
+};
+
+/* Writes the element as "[id: val]". */
+template <typename dist_t>
+inline std::ostream& operator<<(std::ostream& out, SparseVectElem<dist_t> e) {
+  return out << "[" << e.id_ << ": " << e.val_ << "]";
+}
+
+/*
+ * Parses a base-10 integer with strtol and narrows it to int.
+ * If strtol reports ERANGE, returns 0 and leaves errno set to ERANGE.
+ * If the value fits in long but not in int, resets *endPtr to ptr,
+ * sets errno to ERANGE and returns 0.
+ */
+inline int strtoi_wrapper(const char* ptr, char** endPtr) {
+  errno = 0;
+  long val = std::strtol(ptr, endPtr, 10);
+  if (errno == ERANGE) {
+    return 0;
+  }
+  if (val < INT_MIN || val > INT_MAX) {
+    *endPtr = const_cast<char*>(ptr);
+    errno = ERANGE;
+    return 0;
+  }
+  return static_cast<int>(val);
+}
+
+/*
+ * Reads values of type T from line into res with operator>>, after
+ * passing line through ReplaceSomePunct.
+ * Returns false only if an exception is caught (it is logged first).
+ */
+template <typename T>
+inline bool ReadVecDataViaStream(string line, vector<T>& res) {
+  try {
+    res.clear();
+    ReplaceSomePunct(line);
+
+    stringstream str(line);
+    str.exceptions(std::ios::badbit);
+
+    T val;
+    while (str >> val) {
+      res.push_back(val);
+    }
+  } catch (const std::exception& e) {
+    LOG(LIB_ERROR) << "Exception: " << e.what();
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Reads dense vector values from line into res with the C strto* functions.
+ * Reading stops at the first position where no number can be parsed.
+ * Returns false if a value is out of range (errno == ERANGE), true otherwise.
+ * Only the float, double and int specializations below are defined.
+ */
+template <typename T>
+inline bool ReadVecDataEfficiently(string line, vector<T>& res);
+
+template <>
+inline bool ReadVecDataEfficiently<float>(string line, vector<float>& res) {
+  ReplaceSomePunct(line);
+  const char* ptr = line.c_str();
+  char* endPtr = nullptr;
+
+  res.clear();
+  errno = 0;
+
+  for (float val = std::strtof(ptr, &endPtr);
+       ptr != endPtr;
+       val = std::strtof(ptr, &endPtr)) {
+    ptr = endPtr;
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    res.push_back(val);
+  }
+
+  if (errno == ERANGE) {
+    errno = 0;
+    return false;
+  }
+
+  return true;
+}
+
+template <>
+inline bool ReadVecDataEfficiently<double>(string line, vector<double>& res) {
+  ReplaceSomePunct(line);
+  const char* ptr = line.c_str();
+  char* endPtr = nullptr;
+
+  res.clear();
+  errno = 0;
+
+  for (double val = std::strtod(ptr, &endPtr);
+       ptr != endPtr;
+       val = std::strtod(ptr, &endPtr)) {
+    ptr = endPtr;
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    res.push_back(val);
+  }
+
+  if (errno == ERANGE) {
+    errno = 0;
+    return false;
+  }
+
+  return true;
+}
+
+template <>
+inline bool ReadVecDataEfficiently<int>(string line, vector<int>& res) {
+  ReplaceSomePunct(line);
+  const char* ptr = line.c_str();
+  char* endPtr = nullptr;
+
+  res.clear();
+  errno = 0;
+
+  for (int val = strtoi_wrapper(ptr, &endPtr);
+       ptr != endPtr;
+       val = strtoi_wrapper(ptr, &endPtr)) {
+    ptr = endPtr;
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    res.push_back(val);
+  }
+
+  // When a value fits in long but not in int, strtoi_wrapper resets endPtr
+  // to ptr, which ends the loop; that error is caught here.
+  if (errno == ERANGE) {
+    errno = 0;
+    return false;
+  }
+
+  return true;
+}
+
+/*
+ * Reads (id, value) pairs from line into res with operator>>, after
+ * passing line through ReplaceSomePunct.
+ * Returns false only if an exception is caught (it is logged first).
+ */
+template <typename T>
+inline bool ReadSparseVecDataViaStream(string line, vector<SparseVectElem<T>>& res) {
+  try {
+    ReplaceSomePunct(line);
+    stringstream str(line);
+    str.exceptions(std::ios::badbit);
+
+    res.clear();
+
+    uint32_t id;
+    T val;
+
+    while (str >> id && str >> val) {
+      res.push_back(SparseVectElem<T>(id, val));
+    }
+  } catch (const std::exception& e) {
+    LOG(LIB_ERROR) << "Exception: " << e.what();
+    return false;
+  }
+
+  return true;
+}
+
+/*
+ * Reads (id, value) pairs from line into res with strtoi_wrapper and the
+ * C strto* functions. Returns false if an id or a value is out of range
+ * (errno == ERANGE) or if an id is not followed by a parsable value.
+ * Only the float and double specializations below are defined.
+ */
+template <typename T>
+inline bool ReadSparseVecDataEfficiently(string line, vector<SparseVectElem<T>>& res);
+
+template <>
+inline bool ReadSparseVecDataEfficiently<float>(string line, vector<SparseVectElem<float>>& res) {
+  ReplaceSomePunct(line);
+  const char* ptr = line.c_str();
+  char* endPtr = nullptr;
+
+  float val;
+  IdType id;
+
+  res.clear();
+  errno = 0;
+
+  while (true) {
+    if (endPtr != nullptr) ptr = endPtr;
+    id = strtoi_wrapper(ptr, &endPtr);
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    if (ptr == endPtr) break;  // no further id could be parsed
+
+    ptr = endPtr;
+    val = std::strtof(ptr, &endPtr);
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    if (ptr == endPtr) return false;  // an id without a value
+
+    res.push_back(SparseVectElem<float>(id, val));
+  }
+
+  return true;
+}
+
+template <>
+inline bool ReadSparseVecDataEfficiently<double>(string line, vector<SparseVectElem<double>>& res) {
+  ReplaceSomePunct(line);
+  const char* ptr = line.c_str();
+  char* endPtr = nullptr;
+
+  double val;
+  IdType id;
+
+  res.clear();
+  errno = 0;
+
+  while (true) {
+    if (endPtr != nullptr) ptr = endPtr;
+    id = strtoi_wrapper(ptr, &endPtr);
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    if (ptr == endPtr) break;  // no further id could be parsed
+
+    ptr = endPtr;
+    val = std::strtod(ptr, &endPtr);
+    if (errno == ERANGE) {
+      errno = 0;
+      return false;
+    }
+    if (ptr == endPtr) return false;  // an id without a value
+
+    res.push_back(SparseVectElem<double>(id, val));
+  }
+
+  return true;
+}
+
+}  // namespace similarity
+
+#endif
diff --git a/similarity_search/include/space/space_sparse_vector.h b/similarity_search/include/space/space_sparse_vector.h
index b426fd3..74d1fba 100644
--- a/similarity_search/include/space/space_sparse_vector.h
+++ b/similarity_search/include/space/space_sparse_vector.h
@@ -31,33 +31,13 @@
 #include "utils.h"
 #include "space.h"
 #include "distcomp.h"
+#include "read_data.h"
 
 namespace similarity {
 
 using std::vector;
 using std::fill;
 
-template <typename dist_t>
-struct SparseVectElem {
-  uint32_t id_;
-  dist_t val_;
-  SparseVectElem(uint32_t id = 0, dist_t val = 0) : id_(id), val_(val) {}
-  bool operator<(const SparseVectElem& that) const {
-    return id_ < that.id_;
-  }
-  bool operator==(const SparseVectElem& that) const {
-    return id_ == that.id_ && val_ == that.val_;
-  }
-  bool operator!=(const SparseVectElem& that) const {
-    return !operator==(that);
-  }
-};
-
-template <typename dist_t>
-ostream& operator<<(ostream& out, SparseVectElem<dist_t> e) {
-  return out << "[" << e.id_ << ": " << e.val_ << "]";
-}
-
 /*
  * The maximum number of sparse elements that will be kept on the stack
  * by the function ComputeDistanceHelper.
diff --git a/similarity_search/src/space/space_sparse_vector.cc b/similarity_search/src/space/space_sparse_vector.cc index 077a70f..2c18a3f 100644 --- a/similarity_search/src/space/space_sparse_vector.cc +++ b/similarity_search/src/space/space_sparse_vector.cc @@ -27,30 +27,30 @@ #include "logging.h" #include "distcomp.h" #include "experimentconf.h" +#include "read_data.h" namespace similarity { using namespace std; template -void SpaceSparseVector::ReadSparseVec(std::string line, size_t line_num, LabelType& label, vector& v) const +void SpaceSparseVector::ReadSparseVec(string line, size_t line_num, LabelType& label, vector& v) const { v.clear(); label = Object::extractLabel(line); - ReplaceSomePunct(line); - std::stringstream str(line); - - str.exceptions(std::ios::badbit); - - uint32_t id; - dist_t val; +#if 0 + if (!ReadSparseVecDataViaStream(line, v)) { +#else + if (!ReadSparseVecDataEfficiently(line, v)) { +#endif + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line # " << line_num << ": '" << line << "'" << std::endl; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); + } try { - while (str >> id && str >> val) { - v.push_back(ElemType(id, val)); - } sort(v.begin(), v.end()); for (unsigned i = 1; i < v.size(); ++i) { diff --git a/similarity_search/src/space/space_vector.cc b/similarity_search/src/space/space_vector.cc index 7dd93e1..743ff81 100644 --- a/similarity_search/src/space/space_vector.cc +++ b/similarity_search/src/space/space_vector.cc @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include "object.h" #include "utils.h" @@ -29,6 +31,7 @@ #include "distcomp.h" #include "experimentconf.h" #include "space/space_vector.h" +#include "read_data.h" namespace similarity { @@ -126,21 +129,14 @@ void VectorSpace::ReadVec(string line, LabelType& label, vector& label = Object::extractLabel(line); - ReplaceSomePunct(line); - stringstream str(line); - - str.exceptions(ios::badbit); - - dist_t val; - - - try { - while (str >> 
val) { - v.push_back(val); - } - } catch (const exception &e) { - LOG(LIB_ERROR) << "Exception: " << e.what(); - LOG(LIB_FATAL) << "Failed to parse the line: '" << line << "'"; +#if 0 + if (!ReadVecDataViaStream(line, v)) { +#else + if (!ReadVecDataEfficiently(line, v)) { +#endif + PREPARE_RUNTIME_ERR(err) << "Failed to parse the line: '" << line << "'"; + LOG(LIB_ERROR) << err.stream().str(); + THROW_RUNTIME_ERR(err); } }