# MssBenchmark/ann_benchmarks/datasets.py

import h5py
import numpy
import os
import random
import sys

try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve  # Python 3

def download(src, dst):
    if not os.path.exists(dst):
        # TODO: should be atomic
        print('downloading %s -> %s...' % (src, dst))
        urlretrieve(src, dst)


def get_dataset_fn(dataset):
    if not os.path.exists('data'):
        os.mkdir('data')
    return os.path.join('data', '%s.hdf5' % dataset)

def get_dataset(which):
    import h5sparse
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except Exception:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5sparse.File(hdf5_fn, 'r')
    return hdf5_f
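
# Minimal usage sketch (kept as a comment so importing this module has no side
# effects; the dataset name is just one example key from DATASETS below):
#
#     f = get_dataset('glove-25-angular')
#     train = f['train']          # indexing vectors
#     test = f['test']            # query vectors
#     neighbors = f['neighbors']  # ground-truth neighbor indices per query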

# Everything below this line is related to creating datasets.
# You probably never need to do this at home;
# just rely on the prepared datasets at http://ann-benchmarks.com


def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
    import sklearn.neighbors
    import h5sparse

    def replace_last(source_string, replace_what, replace_with):
        head, _sep, tail = source_string.rpartition(replace_what)
        return head + replace_with + tail

    # store the SMILES strings first, in a separate file
    if SMILES:
        smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5')
        print('Write SMILES to file %s' % smile_fn)
        f = h5sparse.File(smile_fn, 'w')
        dt = h5py.special_dtype(vlen=bytes)
        asciiList = [n.encode("ascii", "ignore") for n in SMILES]
        f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
        f.close()
        print('Finish.')

    print('Write dataset %s' % fn)
    f = h5sparse.File(fn, 'w')
    f.attrs['distance'] = distance
    f.attrs['point_type'] = point_type
    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)
    f.create_dataset('train', data=train)
    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')

    # choose which method computes the ground truth: 'balltree' or 'bruteforce'
    if hasattr(train, 'toarray'):
        train = train.toarray()  # densify sparse input before the exact search
    method = 'bruteforce'
    if method == 'balltree':
        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
    else:
        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
        bf.fit(train)
    print(test)
    for i, x in enumerate(test):
        if i % 1 == 0:
            print('%d/%d...' % (i, test.shape[0]))
        if method == 'balltree':
            dist, ind = tree.query([x], k=count)
            neighbors[i] = ind[0]
            distances[i] = dist[0]
        else:
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
            print(neighbors[i])
            print(distances[i])
    f.close()
    print('Finish.')
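
# Layout of the HDF5 file written above (for reference when reading it back):
#   attrs['distance'], attrs['point_type']
#   'train'      - indexing vectors (sparse or dense, stored via h5sparse)
#   'test'       - query vectors
#   'neighbors'  - (n_test, count) int32, ground-truth neighbor indices
#   'distances'  - (n_test, count) float32, the corresponding distances
# When SMILES strings are given, they go to a separate '<name>-SMILES.hdf5'
# file under the 'smile' dataset.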

def train_test_split(X, test_size=10000):
    import sklearn.model_selection
    print('Splitting %d*%d into train/test' % X.shape)
    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)

def glove(out_fn, d):
    import zipfile

    url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
    fn = os.path.join('data', 'glove.twitter.27B.zip')
    download(url, fn)
    with zipfile.ZipFile(fn) as z:
        print('preparing %s' % out_fn)
        z_fn = 'glove.twitter.27B.%dd.txt' % d
        X = []
        for line in z.open(z_fn):
            v = [float(x) for x in line.strip().split()[1:]]
            X.append(numpy.array(v))
        X_train, X_test = train_test_split(numpy.array(X))
        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')

def _load_texmex_vectors(f, n, k):
    import struct

    v = numpy.zeros((n, k))
    for i in range(n):
        f.read(4)  # ignore vec length
        v[i] = struct.unpack('f' * k, f.read(k * 4))
    return v


def _get_irisa_matrix(t, fn):
    import struct

    m = t.getmember(fn)
    f = t.extractfile(m)
    k, = struct.unpack('i', f.read(4))
    n = m.size // (4 + 4 * k)
    f.seek(0)
    return _load_texmex_vectors(f, n, k)
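
# (For reference: each record in the TEXMEX .fvecs format is a 4-byte little-endian
#  integer dimension k followed by k 4-byte floats, which is the 4 + 4*k record size
#  used above to infer the number of vectors in the file.)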

def sift(out_fn):
    import tarfile

    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
    fn = os.path.join('data', 'sift.tar.gz')
    download(url, fn)
    with tarfile.open(fn, 'r:gz') as t:
        train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
        test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
        write_output(train, test, out_fn, 'euclidean')


def gist(out_fn):
    import tarfile

    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
    fn = os.path.join('data', 'gist.tar.gz')
    download(url, fn)
    with tarfile.open(fn, 'r:gz') as t:
        train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
        test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
        write_output(train, test, out_fn, 'euclidean')

def _load_mnist_vectors(fn):
    import gzip
    import struct

    print('parsing vectors in %s...' % fn)
    f = gzip.open(fn)
    type_code_info = {
        0x08: (1, "!B"),
        0x09: (1, "!b"),
        0x0B: (2, "!H"),
        0x0C: (4, "!I"),
        0x0D: (4, "!f"),
        0x0E: (8, "!d")
    }
    magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
    assert magic == 0
    assert type_code in type_code_info

    dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)]
    entry_count = dimensions[0]
    entry_size = numpy.prod(dimensions[1:])

    b, format_string = type_code_info[type_code]
    vectors = []
    for i in range(entry_count):
        vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)])
    return numpy.array(vectors)

def mnist(out_fn):
    download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz')
    download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz')
    train = _load_mnist_vectors('mnist-train.gz')
    test = _load_mnist_vectors('mnist-test.gz')
    write_output(train, test, out_fn, 'euclidean')


def fashion_mnist(out_fn):
    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz',
             'fashion-mnist-train.gz')
    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz',
             'fashion-mnist-test.gz')
    train = _load_mnist_vectors('fashion-mnist-train.gz')
    test = _load_mnist_vectors('fashion-mnist-test.gz')
    write_output(train, test, out_fn, 'euclidean')

def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip first three entries
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt
        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)
        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(n_components=n_dimensions).fit_transform(B)
        X_train, X_test = train_test_split(C)
        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')

def nytimes(out_fn, n_dimensions):
    fn = 'nytimes_%s.txt.gz' % n_dimensions
    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
    transform_bag_of_words(fn, n_dimensions, out_fn)


def random(out_fn, n_dims, n_samples, centers, distance):
    import sklearn.datasets

    X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
    X_train, X_test = train_test_split(X, test_size=0.1)
    write_output(X_train, X_test, out_fn, distance)

def word2bits(out_fn, path, fn):
    import tarfile

    local_fn = fn + '.tar.gz'
    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        n_words, k = [int(z) for z in next(f).strip().split()]
        X = numpy.zeros((n_words, k), dtype=bool)
        for i in range(n_words):
            X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit')

def sift_hamming(out_fn, fn):
    import tarfile

    local_fn = fn + '.tar.gz'
    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        lines = f.readlines()
        X = numpy.zeros((len(lines), 256), dtype=bool)
        for i, line in enumerate(lines):
            X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit')

def lastfm(out_fn, n_dimensions, test_size=50000):
    # This tests out ANN methods for retrieval on simple matrix-factorization-based
    # recommendation algorithms. The idea is that the query/test vectors are user factors
    # and the train set consists of item factors from the matrix factorization model.
    # Since the predictor is a dot product, we transform the factors first as described in this
    # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
    # This hopefully replicates the experiments done in this post:
    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/
    # The dataset is from "Last.fm Dataset - 360K users":
    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html
    # This requires the implicit package to generate the factors (on my desktop/GPU this only
    # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop).
    from implicit.datasets.lastfm import get_lastfm
    from implicit.approximate_als import augment_inner_product_matrix
    import implicit

    # train an ALS model on the last.fm data
    _, _, play_counts = get_lastfm()
    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
    model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8))

    # transform the item factors so that each one has the same norm, and transform the user
    # factors by appending a zero column
    _, item_factors = augment_inner_product_matrix(model.item_factors)
    user_factors = numpy.append(model.user_factors,
                                numpy.zeros((model.user_factors.shape[0], 1)),
                                axis=1)

    # only query the first 50k users (speeds things up significantly without changing results)
    user_factors = user_factors[:test_size]

    # after that transformation a cosine lookup will return the same results as the inner product
    # on the untransformed data
    write_output(item_factors, user_factors, out_fn, 'angular')
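
# Sketch of why the augmentation above works (the standard trick from the paper cited
# in lastfm(); stated here only as a reminder, not verified against the implicit
# package's exact implementation): each item vector x_i gets one extra coordinate
# sqrt(max_norm**2 - ||x_i||**2), so every item ends up with the same norm max_norm,
# while each user vector gets a 0 in that coordinate. Dot products are unchanged, and
# since item norms are now equal, ranking by cosine equals ranking by inner product.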

def get_sparse_matrix_from_txt(file=None, dtype=bool):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from scipy.sparse import csr_matrix

    dimension = 1024
    SMILES = []
    indptr = [0]
    indices = []
    data = []
    num_mols = 0
    if file is None:
        file = '../pycharm_project_422/clustering_toydata.txt'
    with open(file, "r") as file_object:
        for line in file_object.readlines():
            elements = line.split()
            if len(elements) != 14:
                continue
            smile = elements[7]
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                continue
            SMILES.append(smile)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
            for i in range(dimension):
                if fp.GetBit(i):
                    indices.append(i)
                    data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
    return fps, SMILES

def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=bool):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    import glob
    import gzip
    from scipy.sparse import csr_matrix

    SMILES = []
    indptr = [0]
    indices = []
    data = []
    num_mols = 0
    file_list = glob.glob(dir + '/*.sdf.gz')
    print(file_list)
    for file in file_list:
        inf = gzip.open(file)
        suppl = Chem.ForwardSDMolSupplier(inf)
        for mol in suppl:
            if mol is None:
                continue
            smile = Chem.MolToSmiles(mol)
            SMILES.append(smile)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
            for i in range(dimension):
                if fp.GetBit(i):
                    indices.append(i)
                    data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
    return fps, SMILES
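
# Minimal fingerprint sketch showing what the two loaders above compute per molecule
# (the SMILES string is a made-up example; radius 2 with 1024 bits matches the calls above):
#
#     from rdkit import Chem
#     from rdkit.Chem import AllChem
#     mol = Chem.MolFromSmiles('CCO')
#     fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
#     on_bits = list(fp.GetOnBits())   # column indices that become 1s in the CSR matrix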

def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
    from sklearn.utils import shuffle

    print('prepare dataset ' + dataset_name)
    if type == 'bit':
        dtype = bool
    elif type == 'int':
        dtype = int
    else:
        dtype = float
    if dataset_name == 'Molport':
        dir = '/home/cjz18001/Molport'
    elif dataset_name == 'Chembl':
        dir = '/home/cjz18001/Chembl'
    else:
        print('unknown dataset')
        sys.exit(1)
    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)

    # randomly shuffle the fingerprints and SMILES at the same time
    seed = 1  # random.randint(0, 2 ** 32 - 1)
    X, SMILES = shuffle(X, SMILES, random_state=seed)

    # split the data and make the test set a dense matrix
    train_size = X.shape[0] - test_size
    X_train = X[:train_size]
    X_test = X[train_size:]
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print('Train data dimension: %d*%d' % X_train.shape)
    print('Test data dimension:  %d*%d' % X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)

def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
    print('prepare dataset ' + dataset_name)
    import pickle
    from scipy.sparse import vstack

    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
    if type == 'bit':
        dtype = bool
    elif type == 'int':
        dtype = int
    else:
        dtype = float
    # vertically stack sparse matrices from multiple files
    test_size = 1
    if num_files == 0.5:
        with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
            Y = pickle.load(handle, encoding='latin1')
        size = 1000000
        print('select %i out of %i' % (size, Y.shape[0]))
        Y = Y[:size]
        X_test = Y[Y.shape[0] - test_size:]
        X_train = Y[:Y.shape[0] - test_size]
    else:
        first = False
        for i in range(num_files):
            print('process trunk %d' % i)
            if not first:
                first = True
                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
                    Y = pickle.load(handle, encoding='latin1')
                if i == num_files - 1:  # last one
                    X_test = Y[Y.shape[0] - test_size:]
                    X_train = Y[:Y.shape[0] - test_size]
                else:
                    X_train = Y
            else:
                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
                    Y = pickle.load(handle, encoding='latin1')
                if i == num_files - 1:  # last one
                    X_test = Y[Y.shape[0] - test_size:]
                    X_train = vstack([X_train, Y[:Y.shape[0] - test_size]])
                else:
                    X_train = vstack([X_train, Y])
    # X_train = X_train.astype(dtype)
    # X_test = X_test.astype(dtype)
    # X_train, X_test = train_test_split(X, test_size=1000)
    # X_test = X_test.toarray()
    # calling train_test_split ran into a memory error for 100M rows,
    # so only the (small) test set is densified here
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print(X_train.shape)
    print(X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, 1000)

def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
    print('prepare dataset ' + dataset_name)
    import pickle
    from scipy.sparse import vstack

    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
    if type == 'bit':
        dtype = bool
    elif type == 'int':
        dtype = int
    else:
        dtype = float
    # load the first sparse trunk and take a fixed-size slice of it
    test_size = 3
    with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
        Y = pickle.load(handle, encoding='latin1')
    size = 10000000
    print('select %i out of %i' % (size, Y.shape[0]))
    Y = Y[:size]
    X_test = Y[Y.shape[0] - test_size:]
    X_train = Y[:Y.shape[0] - test_size]
    # make them dense matrices here
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print(X_train.shape)
    print(X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, 1000)

DATASETS = {
    'fashion-mnist-784-euclidean': fashion_mnist,
    'gist-960-euclidean': gist,
    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
    'mnist-784-euclidean': mnist,
    'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'),
    'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'),
    'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'),
    'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'),
    'sift-128-euclidean': sift,
    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
    'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
    # below are the datasets Chunjiang added
    'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100),
    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
    'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'),
    'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
    'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
    'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'),
    'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
    'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
    'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
}
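
# Minimal sketch for building one of the entries above by hand (the key is just one
# example; get_dataset() does the same thing automatically when a download fails):
#
#     name = 'random-xs-20-euclidean'
#     DATASETS[name](get_dataset_fn(name))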