import h5py
import numpy
import os
import random
import sys
try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve  # Python 3
def download(src, dst):
    if not os.path.exists(dst):
        # TODO: should be atomic
        print('downloading %s -> %s...' % (src, dst))
        urlretrieve(src, dst)
def get_dataset_fn(dataset):
    if not os.path.exists('data'):
        os.mkdir('data')
    return os.path.join('data', '%s.hdf5' % dataset)
def get_dataset(which):
    import h5sparse
    hdf5_fn = get_dataset_fn(which)
    try:
        url = 'http://ann-benchmarks.com/%s.hdf5' % which
        download(url, hdf5_fn)
    except Exception:
        print("Cannot download %s" % url)
        if which in DATASETS:
            print("Creating dataset locally")
            DATASETS[which](hdf5_fn)
    hdf5_f = h5sparse.File(hdf5_fn)
    return hdf5_f
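# Example usage (illustrative sketch, not called anywhere in this module): fetch a
# dataset by name and read the groups written by write_output() below. The dataset
# name used here is just an example.
#
#   f = get_dataset('glove-25-angular')
#   print(f.attrs['distance'])     # e.g. 'angular', 'euclidean', 'jaccard'
#   train = f['train']             # sparse or dense, depending on the dataset
#   test = f['test'][:]
#   true_neighbors = f['neighbors'][:]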
# Everything below this line is related to creating datasets
# You probably never need to do this at home, just rely on the prepared datasets at http://ann-benchmarks.com
def write_output(train, test, fn, distance, point_type='float', count=1000, SMILES=None):
    from ann_benchmarks.algorithms.bruteforce import BruteForceBLAS
    import sklearn.neighbors
    import h5sparse

    def replace_last(source_string, replace_what, replace_with):
        head, _sep, tail = source_string.rpartition(replace_what)
        return head + replace_with + tail

    # store the SMILES strings first, in a companion file next to the dataset
    if SMILES:
        smile_fn = replace_last(fn, '.hdf5', '-SMILES.hdf5')
        print('Writing SMILES to file %s' % smile_fn)
        f = h5sparse.File(smile_fn, 'w')
        dt = h5py.special_dtype(vlen=bytes)
        asciiList = [n.encode("ascii", "ignore") for n in SMILES]
        f.create_dataset('smile', (len(asciiList), 1), dtype=dt, data=asciiList)
        f.close()
        print('Finish.')

    print('Writing dataset %s' % fn)
    f = h5sparse.File(fn, 'w')
    f.attrs['distance'] = distance
    f.attrs['point_type'] = point_type
    print('train size: %9d * %4d' % train.shape)
    print('test size:  %9d * %4d' % test.shape)
    f.create_dataset('train', data=train)
    f.create_dataset('test', test.shape, dtype=test.dtype)[:] = test
    neighbors = f.create_dataset('neighbors', (test.shape[0], count), dtype='i')
    distances = f.create_dataset('distances', (test.shape[0], count), dtype='f')

    # choose which method computes the ground truth: 'balltree' or brute force
    if hasattr(train, 'toarray'):  # densify sparse fingerprint matrices for the ground-truth search
        train = train.toarray()
    method = 'bruteforce'
    if method == 'balltree':
        tree = sklearn.neighbors.BallTree(train, leaf_size=1000000, metric=distance)
    else:
        bf = BruteForceBLAS(metric=distance, precision=train.dtype)
        bf.fit(train)

    print(test)
    for i, x in enumerate(test):
        if i % 1 == 0:
            print('%d/%d...' % (i, test.shape[0]))
        if method == 'balltree':
            dist, ind = tree.query([x], k=count)
            neighbors[i] = ind[0]
            distances[i] = dist[0]
        else:
            res = list(bf.query_with_distances(x, count))
            res.sort(key=lambda t: t[-1])
            neighbors[i] = [j for j, _ in res]
            distances[i] = [d for _, d in res]
            print(neighbors[i])
            print(distances[i])
    f.close()
    print('Finish.')
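# Layout of the file produced by write_output() (for reference; exactly the groups
# and attributes written above):
#   attrs['distance']    distance metric name, e.g. 'angular', 'euclidean', 'jaccard'
#   attrs['point_type']  'float', 'bit' or 'int'
#   'train'              the indexable points (sparse matrices are stored via h5sparse)
#   'test'               the query points, always dense
#   'neighbors'          (n_test, count) int ids of the true nearest neighbors per query
#   'distances'          (n_test, count) float distances matching 'neighbors'
# When SMILES strings are supplied, they go to a companion '<name>-SMILES.hdf5' file
# under the dataset 'smile'.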
def train_test_split(X, test_size=10000):
    import sklearn.model_selection
    print('Splitting %d*%d into train/test' % X.shape)
    return sklearn.model_selection.train_test_split(X, test_size=test_size, random_state=1)
def glove(out_fn, d):
    import zipfile
    url = 'http://nlp.stanford.edu/data/glove.twitter.27B.zip'
    fn = os.path.join('data', 'glove.twitter.27B.zip')
    download(url, fn)
    with zipfile.ZipFile(fn) as z:
        print('preparing %s' % out_fn)
        z_fn = 'glove.twitter.27B.%dd.txt' % d
        X = []
        for line in z.open(z_fn):
            v = [float(x) for x in line.strip().split()[1:]]
            X.append(numpy.array(v))
        # convert to a single array so train_test_split can report the shape
        X_train, X_test = train_test_split(numpy.array(X))
        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
def _load_texmex_vectors(f, n, k):
    import struct
    v = numpy.zeros((n, k))
    for i in range(n):
        f.read(4)  # ignore vec length
        v[i] = struct.unpack('f' * k, f.read(k * 4))
    return v
def _get_irisa_matrix(t, fn):
    import struct
    m = t.getmember(fn)
    f = t.extractfile(m)
    k, = struct.unpack('i', f.read(4))
    n = m.size // (4 + 4 * k)
    f.seek(0)
    return _load_texmex_vectors(f, n, k)
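# Note on the .fvecs layout assumed above: each record is a 4-byte integer k
# (the vector dimension) followed by k 4-byte floats, i.e. 4 + 4*k bytes per
# vector, which is why the number of vectors is m.size // (4 + 4*k). For example,
# a 128-dimensional SIFT base file stores 4 + 512 = 516 bytes per vector.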
def sift(out_fn):
    import tarfile
    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz'
    fn = os.path.join('data', 'sift.tar.gz')
    download(url, fn)
    with tarfile.open(fn, 'r:gz') as t:
        train = _get_irisa_matrix(t, 'sift/sift_base.fvecs')
        test = _get_irisa_matrix(t, 'sift/sift_query.fvecs')
        write_output(train, test, out_fn, 'euclidean')
def gist(out_fn):
    import tarfile
    url = 'ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz'
    fn = os.path.join('data', 'gist.tar.gz')
    download(url, fn)
    with tarfile.open(fn, 'r:gz') as t:
        train = _get_irisa_matrix(t, 'gist/gist_base.fvecs')
        test = _get_irisa_matrix(t, 'gist/gist_query.fvecs')
        write_output(train, test, out_fn, 'euclidean')
def _load_mnist_vectors(fn):
    import gzip
    import struct

    print('parsing vectors in %s...' % fn)
    f = gzip.open(fn)
    type_code_info = {
        0x08: (1, "!B"),
        0x09: (1, "!b"),
        0x0B: (2, "!H"),
        0x0C: (4, "!I"),
        0x0D: (4, "!f"),
        0x0E: (8, "!d")
    }
    magic, type_code, dim_count = struct.unpack("!hBB", f.read(4))
    assert magic == 0
    assert type_code in type_code_info

    dimensions = [struct.unpack("!I", f.read(4))[0] for i in range(dim_count)]
    entry_count = dimensions[0]
    entry_size = numpy.product(dimensions[1:])

    b, format_string = type_code_info[type_code]
    vectors = []
    for i in range(entry_count):
        vectors.append([struct.unpack(format_string, f.read(b))[0] for j in range(entry_size)])
    return numpy.array(vectors)
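# Note on the IDX header parsed above: the first four bytes are a zero 16-bit magic
# value, a type code (mapped to element size and struct format by type_code_info),
# and the number of dimensions; each dimension size then follows as a big-endian
# 32-bit integer. For the MNIST training images that gives dimensions
# [60000, 28, 28], so each entry becomes a flattened 28*28 = 784-element vector.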
def mnist(out_fn):
    download('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz', 'mnist-train.gz')
    download('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz', 'mnist-test.gz')
    train = _load_mnist_vectors('mnist-train.gz')
    test = _load_mnist_vectors('mnist-test.gz')
    write_output(train, test, out_fn, 'euclidean')
def fashion_mnist(out_fn):
    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/train-images-idx3-ubyte.gz', 'fashion-mnist-train.gz')
    download('http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/t10k-images-idx3-ubyte.gz', 'fashion-mnist-test.gz')
    train = _load_mnist_vectors('fashion-mnist-train.gz')
    test = _load_mnist_vectors('fashion-mnist-test.gz')
    write_output(train, test, out_fn, 'euclidean')
def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection
    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip first three entries
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt
        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)
        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(n_components=n_dimensions).fit_transform(B)
        X_train, X_test = train_test_split(C)
        write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
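# Minimal sketch of the same TF-IDF + Gaussian random projection pipeline on toy
# data (illustrative only; the shapes and density below are made up):
#
#   from scipy.sparse import random as sparse_random
#   from sklearn.feature_extraction.text import TfidfTransformer
#   from sklearn import random_projection
#   A = sparse_random(100, 5000, density=0.01, format='lil')   # docs x vocabulary counts
#   B = TfidfTransformer().fit_transform(A)                    # reweight raw counts
#   C = random_projection.GaussianRandomProjection(n_components=16).fit_transform(B)
#   print(C.shape)                                             # (100, 16), dense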
def nytimes(out_fn, n_dimensions):
    fn = 'nytimes_%s.txt.gz' % n_dimensions
    download('https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz', fn)
    transform_bag_of_words(fn, n_dimensions, out_fn)
def random(out_fn, n_dims, n_samples, centers, distance):
    import sklearn.datasets
    X, _ = sklearn.datasets.make_blobs(n_samples=n_samples, n_features=n_dims, centers=centers, random_state=1)
    X_train, X_test = train_test_split(X, test_size=0.1)
    write_output(X_train, X_test, out_fn, distance)
def word2bits(out_fn, path, fn):
    import tarfile
    local_fn = fn + '.tar.gz'
    url = 'http://web.stanford.edu/~maxlam/word_vectors/compressed/%s/%s.tar.gz' % (path, fn)
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        n_words, k = [int(z) for z in next(f).strip().split()]
        X = numpy.zeros((n_words, k), dtype=numpy.bool)
        for i in range(n_words):
            X[i] = numpy.array([float(z) > 0 for z in next(f).strip().split()[1:]], dtype=numpy.bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
def sift_hamming(out_fn, fn):
    import tarfile
    local_fn = fn + '.tar.gz'
    url = 'http://sss.projects.itu.dk/ann-benchmarks/datasets/%s.tar.gz' % fn
    download(url, local_fn)
    print('parsing vectors in %s...' % local_fn)
    with tarfile.open(local_fn, 'r:gz') as t:
        f = t.extractfile(fn)
        lines = f.readlines()
        X = numpy.zeros((len(lines), 256), dtype=numpy.bool)
        for i, line in enumerate(lines):
            X[i] = numpy.array([int(x) > 0 for x in line.decode().strip()], dtype=numpy.bool)
        X_train, X_test = train_test_split(X, test_size=1000)
        write_output(X_train, X_test, out_fn, 'hamming', 'bit')
def lastfm(out_fn, n_dimensions, test_size=50000):
    # This tests out ANN methods for retrieval on simple matrix-factorization-based
    # recommendation algorithms. The idea is that the query/test vectors are user factors
    # and the train set consists of item factors from the matrix factorization model.
    # Since the predictor is a dot product, we transform the factors first as described in this
    # paper: https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf
    # This hopefully replicates the experiments done in this post:
    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/
    # The dataset is from "Last.fm Dataset - 360K users":
    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html

    # This requires the implicit package to generate the factors (on my desktop/gpu this only
    # takes 4-5 seconds to train - but could take 1-2 minutes on a laptop).
    from implicit.datasets.lastfm import get_lastfm
    from implicit.approximate_als import augment_inner_product_matrix
    import implicit

    # train an ALS model on the last.fm data
    _, _, play_counts = get_lastfm()
    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
    model.fit(implicit.nearest_neighbours.bm25_weight(play_counts, K1=100, B=0.8))

    # transform the item factors so that each one has the same norm, and transform the user
    # factors by appending a zero column
    _, item_factors = augment_inner_product_matrix(model.item_factors)
    user_factors = numpy.append(model.user_factors,
                                numpy.zeros((model.user_factors.shape[0], 1)),
                                axis=1)

    # only query the first 50k users (speeds things up significantly without changing results)
    user_factors = user_factors[:test_size]

    # after that transformation a cosine lookup will return the same results as the inner product
    # on the untransformed data
    write_output(item_factors, user_factors, out_fn, 'angular')
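# Sketch of the inner-product-to-cosine trick used above (roughly what
# augment_inner_product_matrix computes; toy code for illustration only):
# append sqrt(m^2 - ||x||^2) to every item vector x, where m is the largest item
# norm, so all augmented items share the norm m, and append 0 to every query q.
# Then <q', x'> = <q, x> while ||x'|| is constant, so ranking by cosine on the
# augmented vectors matches ranking by inner product on the original factors.
#
#   norms = numpy.linalg.norm(item_factors, axis=1)
#   m = norms.max()
#   extra_dim = numpy.sqrt(m ** 2 - norms ** 2)
#   items_aug = numpy.append(item_factors, extra_dim[:, None], axis=1)
#   queries_aug = numpy.append(user_factors, numpy.zeros((len(user_factors), 1)), axis=1)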
def get_sparse_matrix_from_txt(file=None, dtype=numpy.bool):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    from scipy.sparse import csr_matrix

    dimension = 1024
    SMILES = []
    indptr = [0]
    indices = []
    data = []
    num_mols = 0
    if file is None:
        file = '../pycharm_project_422/clustering_toydata.txt'
    with open(file, "r") as file_object:
        for line in file_object.readlines():
            elements = line.split()
            if len(elements) != 14:
                continue
            smile = elements[7]
            mol = Chem.MolFromSmiles(smile)
            if mol is None:
                continue
            SMILES.append(smile)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
            for i in range(dimension):
                if fp.GetBit(i):
                    indices.append(i)
                    data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
    return fps, SMILES
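# How the indptr/indices/data triple above encodes the fingerprints, on a tiny
# example (illustrative only): two 8-bit fingerprints 01100000 and 00010001 give
#   indices = [1, 2, 3, 7]        column of every set bit, row by row
#   data    = [1, 1, 1, 1]
#   indptr  = [0, 2, 4]           row i occupies indices[indptr[i]:indptr[i+1]]
#   csr_matrix((data, indices, indptr), shape=(2, 8))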
def get_sparse_matrix_from_sdf(dir, dimension=1024, dtype=numpy.bool):
    from rdkit import Chem
    from rdkit.Chem import AllChem
    import glob
    import gzip
    from scipy.sparse import csr_matrix

    SMILES = []
    indptr = [0]
    indices = []
    data = []
    num_mols = 0
    file_list = glob.glob(dir + '/*.sdf.gz')
    print(file_list)
    for file in file_list:
        inf = gzip.open(file)
        suppl = Chem.ForwardSDMolSupplier(inf)
        for mol in suppl:
            if mol is None:
                continue
            smile = Chem.MolToSmiles(mol)
            SMILES.append(smile)
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=dimension)
            for i in range(dimension):
                if fp.GetBit(i):
                    indices.append(i)
                    data.append(1)
            indptr.append(len(indices))
            num_mols += 1
    fps = csr_matrix((data, indices, indptr), shape=(num_mols, dimension), dtype=dtype)
    print('The dimension of the returned sparse matrix: %d*%d' % fps.shape)
    return fps, SMILES
def ecfp(out_fn, dataset_name, dimension, distance, type, test_size=1000):
    from sklearn.utils import shuffle
    print('prepare dataset ' + dataset_name)
    if type == 'bit':
        dtype = numpy.bool
    elif type == 'int':
        dtype = numpy.int
    else:
        dtype = numpy.float
    if dataset_name == 'Molport':
        dir = '/home/cjz18001/Molport'
    elif dataset_name == 'Chembl':
        dir = '/home/cjz18001/Chembl'
    else:
        print('unknown dataset')
        sys.exit(1)
    X, SMILES = get_sparse_matrix_from_sdf(dir=dir, dimension=dimension, dtype=dtype)

    # randomly shuffle fingerprints and SMILES at the same time
    seed = 1  # random.randint(0, 2 ** 32 - 1)
    X, SMILES = shuffle(X, SMILES, random_state=seed)

    # split the data and make the test data a full (dense) matrix
    train_size = X.shape[0] - test_size
    X_train = X[:train_size]
    X_test = X[train_size:]
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print('Train data dimension: %d*%d' % X_train.shape)
    print('Test data dimension: %d*%d' % X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, count=1000, SMILES=SMILES)
def ecfp_sparse_multi(out_fn, dataset_name, num_files, dimension, distance, type):
    print('prepare dataset ' + dataset_name)
    import pickle
    from scipy.sparse import vstack
    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
    if type == 'bit':
        dtype = numpy.bool
    elif type == 'int':
        dtype = numpy.int
    else:
        dtype = numpy.float
    # vertically stack sparse matrices from multiple files
    test_size = 1
    if num_files == 0.5:
        with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
            Y = pickle.load(handle, encoding='latin1')
        size = 1000000
        print('select %i out of %i' % (size, Y.shape[0]))
        Y = Y[:size]
        X_test = Y[Y.shape[0] - test_size:]
        X_train = Y[:Y.shape[0] - test_size]
    else:
        first = False
        for i in range(num_files):
            print('process trunk ' + str(i))
            if not first:
                first = True
                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
                    Y = pickle.load(handle, encoding='latin1')
                if i == num_files - 1:  # last one
                    X_test = Y[Y.shape[0] - test_size:]
                    X_train = Y[:Y.shape[0] - test_size]
                else:
                    X_train = Y
            else:
                with open(path + dataset_name + '_' + str(dimension) + '_trunk_' + str(i) + '.pickle', 'rb') as handle:
                    Y = pickle.load(handle, encoding='latin1')
                if i == num_files - 1:  # last one
                    X_test = Y[Y.shape[0] - test_size:]
                    X_train = vstack([X_train, Y[:Y.shape[0] - test_size]])
                else:
                    X_train = vstack([X_train, Y])
    # X_train = X_train.astype(dtype)
    # X_test = X_test.astype(dtype)
    # X_train, X_test = train_test_split(X, test_size=1000)
    # calling train_test_split ran into a memory error for the 100M set,
    # so the train/test split is done manually above
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print(X_train.shape)
    print(X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, 1000)
def ecfp_multi(out_fn, dataset_name, num_files, dimension, distance, type):
    print('prepare dataset ' + dataset_name)
    import pickle
    from scipy.sparse import vstack
    path = '/data/chunjiangzhu/Enamine_680M_SparseMatrix/'
    if type == 'bit':
        dtype = numpy.bool
    elif type == 'int':
        dtype = numpy.int
    else:
        dtype = numpy.float
    # vertically stack sparse matrices from multiple files
    test_size = 3
    with open(path + dataset_name + '_' + str(dimension) + '_trunk_0.pickle', 'rb') as handle:
        Y = pickle.load(handle, encoding='latin1')
    size = 10000000
    print('select %i out of %i' % (size, Y.shape[0]))
    Y = Y[:size]
    X_test = Y[Y.shape[0] - test_size:]
    X_train = Y[:Y.shape[0] - test_size]
    # make them full matrices here
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    print('finish dataset preparation')
    print(X_train.shape)
    print(X_test.shape)
    write_output(X_train, X_test, out_fn, distance, type, 1000)
DATASETS = {
    'fashion-mnist-784-euclidean': fashion_mnist,
    'gist-960-euclidean': gist,
    'glove-25-angular': lambda out_fn: glove(out_fn, 25),
    'glove-50-angular': lambda out_fn: glove(out_fn, 50),
    'glove-100-angular': lambda out_fn: glove(out_fn, 100),
    'glove-200-angular': lambda out_fn: glove(out_fn, 200),
    'mnist-784-euclidean': mnist,
    'random-xs-20-euclidean': lambda out_fn: random(out_fn, 20, 10000, 100, 'euclidean'),
    'random-s-100-euclidean': lambda out_fn: random(out_fn, 100, 100000, 1000, 'euclidean'),
    'random-xs-20-angular': lambda out_fn: random(out_fn, 20, 10000, 100, 'angular'),
    'random-s-100-angular': lambda out_fn: random(out_fn, 100, 100000, 1000, 'angular'),
    'sift-128-euclidean': sift,
    'nytimes-256-angular': lambda out_fn: nytimes(out_fn, 256),
    'nytimes-16-angular': lambda out_fn: nytimes(out_fn, 16),
    'word2bits-800-hamming': lambda out_fn: word2bits(out_fn, '400K', 'w2b_bitlevel1_size800_vocab400K'),
    'lastfm-64-dot': lambda out_fn: lastfm(out_fn, 64),
    'sift-256-hamming': lambda out_fn: sift_hamming(out_fn, 'sift.hamming.256'),
    # below are the datasets Chunjiang added
    'toy-1024-jaccard': lambda out_fn: ecfp(out_fn, 'toydata', 1024, 'jaccard', 'bit', 100),
    'chembl-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl', 1024, 'jaccard', 'bit'),
    'molport-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Molport', 1024, 'jaccard', 'bit'),
    'chembl100K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl100K', 1024, 'jaccard', 'int'),
    'chembl10K-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K', 1024, 'jaccard', 'int'),
    'chembl-sparse-1024-jaccard': lambda out_fn: ecfp(out_fn, 'Chembl10K-sparse', 1024, 'jaccard', 'bit'),
    'enamine100M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 5, 1024, 'jaccard', 'bit'),
    'enamine10M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit'),
    'enamine20M-sparse-1024-jaccard': lambda out_fn: ecfp_sparse_multi(out_fn, 'Enamine', 1, 1024, 'jaccard', 'bit'),
    'enamine10M-1024-jaccard': lambda out_fn: ecfp_multi(out_fn, 'Enamine', 0.5, 1024, 'jaccard', 'bit')
}
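# Example (illustrative): building one of the datasets above by hand instead of
# downloading it. get_dataset() already falls back to this automatically when the
# download from ann-benchmarks.com fails.
#
#   fn = get_dataset_fn('glove-25-angular')
#   DATASETS['glove-25-angular'](fn)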