From c5712a3ec378c91da61d83dd6b2102e9fd409342 Mon Sep 17 00:00:00 2001 From: searchivarius Date: Tue, 7 May 2019 22:49:00 -0400 Subject: [PATCH] Support saving distance sampling. --- similarity_search/apps/report_intr_dim.cc | 29 +++++++++++++++---- similarity_search/include/report_intr_dim.h | 5 +++- .../src/method/perm_bin_vptree.cc | 3 +- similarity_search/src/method/proj_vptree.cc | 3 +- 4 files changed, 32 insertions(+), 8 deletions(-) diff --git a/similarity_search/apps/report_intr_dim.cc b/similarity_search/apps/report_intr_dim.cc index f4e34f0..9c828f5 100644 --- a/similarity_search/apps/report_intr_dim.cc +++ b/similarity_search/apps/report_intr_dim.cc @@ -75,7 +75,8 @@ void TestSpace( string dataFile, bool compMuDeffect, unsigned maxNumData, - unsigned sampleQty + unsigned sampleQty, + string sampleFile ) { string spaceType; vector vSpaceArgs; @@ -90,9 +91,21 @@ void TestSpace( vector tmp; unique_ptr inpState(space->ReadDataset(data, tmp, dataFile, maxNumData)); space->UpdateParamsFromFile(*inpState); + vector dist; // Prints the report - ReportIntrinsicDimensionality("********", *space, data, sampleQty); + ReportIntrinsicDimensionality("********", *space, data, dist, sampleQty); + if (!sampleFile.empty()) { + ofstream of(sampleFile); + CHECK_MSG(of, "Cannot open for writing file: " + sampleFile); + of.exceptions ( std::ifstream::failbit | std::ifstream::badbit ); + + for (size_t i = 0; i < dist.size(); ++i) { + if (i) of << "\t"; + of << dist[i]; + } + of << std::endl; + } if (compMuDeffect) { double dleft, dright; ComputeMuDeffect( @@ -108,6 +121,7 @@ void TestSpace( int main(int argc, char* argv[]) { string spaceDesc, distType; string dataFile; + string sampleFile; unsigned maxNumData; unsigned sampleQty; bool compMuDeffect; @@ -126,6 +140,8 @@ int main(int argc, char* argv[]) { &sampleQty, false, defaultSampleQty)); cmd_options.Add(new CmdParam("muDeffect,m", "estimate the left and the right mu deffectiveness", &compMuDeffect, false, false)); + cmd_options.Add(new CmdParam("sampleFile", "optional output sample file", + &sampleFile, false, "")); try { cmd_options.Parse(argc, argv); @@ -143,7 +159,8 @@ int main(int argc, char* argv[]) { dataFile, compMuDeffect, maxNumData, - sampleQty + sampleQty, + sampleFile ); } else if (DIST_TYPE_FLOAT == distType) { TestSpace( @@ -151,7 +168,8 @@ int main(int argc, char* argv[]) { dataFile, compMuDeffect, maxNumData, - sampleQty + sampleQty, + sampleFile ); } else if (DIST_TYPE_DOUBLE == distType) { TestSpace( @@ -159,7 +177,8 @@ int main(int argc, char* argv[]) { dataFile, compMuDeffect, maxNumData, - sampleQty + sampleQty, + sampleFile ); } diff --git a/similarity_search/include/report_intr_dim.h b/similarity_search/include/report_intr_dim.h index 0259c6a..dec015a 100644 --- a/similarity_search/include/report_intr_dim.h +++ b/similarity_search/include/report_intr_dim.h @@ -33,9 +33,10 @@ void ComputeIntrinsicDimensionality(const Space& space, double& IntrDim, double& DistMean, double& DistSigma, + std::vector& dist, size_t SampleQty = 1000000) { - std::vector dist; DistMean = 0; + dist.clear(); for (size_t n = 0; n < SampleQty; ++n) { size_t r1 = RandomInt() % dataset.size(); size_t r2 = RandomInt() % dataset.size(); @@ -70,6 +71,7 @@ template void ReportIntrinsicDimensionality(const string& reportName, const Space& space, const ObjectVector& dataset, + std::vector& dist, size_t SampleQty = 1000000) { double DistMean, DistSigma, IntrDim; @@ -77,6 +79,7 @@ void ReportIntrinsicDimensionality(const string& reportName, IntrDim, DistMean, DistSigma, + dist, SampleQty); LOG(LIB_INFO) << "### " << reportName; diff --git a/similarity_search/src/method/perm_bin_vptree.cc b/similarity_search/src/method/perm_bin_vptree.cc index 9eca6a0..0018f6d 100644 --- a/similarity_search/src/method/perm_bin_vptree.cc +++ b/similarity_search/src/method/perm_bin_vptree.cc @@ -72,7 +72,8 @@ void PermBinVPTree::CreateIndex(const AnyParams& Ind BinPermData_[i] = VPTreeSpace_->CreateObjFromVect(i, -1, binPivot); } - ReportIntrinsicDimensionality("Set of permutations" , *VPTreeSpace_, BinPermData_); + vector dist; + ReportIntrinsicDimensionality("Set of permutations" , *VPTreeSpace_, BinPermData_, dist); VPTreeIndex_.reset(new VPTree>( PrintProgress_, *VPTreeSpace_, diff --git a/similarity_search/src/method/proj_vptree.cc b/similarity_search/src/method/proj_vptree.cc index 20032e5..4b623fd 100644 --- a/similarity_search/src/method/proj_vptree.cc +++ b/similarity_search/src/method/proj_vptree.cc @@ -167,7 +167,8 @@ void ProjectionVPTree::CreateIndex(const AnyParams& IndexParams) { projData_[id] = ProjectOneVect(id, NULL, this->data_[id]); } - ReportIntrinsicDimensionality("Set of projections" , *VPTreeSpace_, projData_); + vector dist; + ReportIntrinsicDimensionality("Set of projections" , *VPTreeSpace_, projData_, dist); VPTreeIndex_.reset(new VPTree>( PrintProgress_,