diff --git a/billion-scale-image-search/.gitignore b/billion-scale-image-search/.gitignore index 14e52e9b4..78e62ff4d 100644 --- a/billion-scale-image-search/.gitignore +++ b/billion-scale-image-search/.gitignore @@ -9,5 +9,4 @@ data-plane-public-cert.pem data-plane-private-key.pem centroids.jsonl feed.jsonl -img_emb_0000.npy -metadata_0000.parquet +laion5b_100m_part_*.parquet diff --git a/billion-scale-image-search/README.md b/billion-scale-image-search/README.md index ea106dc84..0e75af0a4 100644 --- a/billion-scale-image-search/README.md +++ b/billion-scale-image-search/README.md @@ -163,19 +163,21 @@ $ vespa clone billion-scale-image-search myapp && cd myapp ## Download Vector + Metadata -These instructions use the first split file (0000) of a total of 2314 files in the LAION2B-en split. -Download the vector data file: +These instructions use a 100M-vector subset of the LAION-5B CLIP ViT-L/14 embeddings, +hosted on ClickHouse's S3 bucket as parquet files. Each parquet file contains 10M rows with +both the 768-dim CLIP vectors and all metadata columns (url, caption, NSFW, similarity, etc.). -
-$ curl --http1.1 -L -o img_emb_0000.npy \ - https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/img_emb/img_emb_0000.npy -+> **Note:** The parquet files are hosted by ClickHouse as part of their +> [example datasets](https://clickhouse.com/docs/getting-started/example-datasets/laion-5b-dataset). +> This is a third-party resource not maintained by Vespa - availability and terms of use +> are subject to change. The underlying LAION-5B data is released under +> [CC-BY-4.0](https://laion.ai/blog/laion-5b/) for research purposes. -Download the metadata file: +Download the first part (of 10 total):
-$ curl -L -o metadata_0000.parquet \ - https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/laion2B-en-metadata/metadata_0000.parquet +$ curl -L -o laion5b_100m_part_1_of_10.parquet \ + https://clickhouse-datasets.s3.amazonaws.com/laion-5b/laion5b_100m_part_1_of_10.parquetInstall python dependencies to process the files: @@ -189,17 +191,19 @@ centroids. Performing an incremental clustering can improve vector search recall indexing fewer centroids. For simplicity, this tutorial uses random sampling.
-$ python3 app/src/main/python/create-centroid-feed.py img_emb_0000.npy > centroids.jsonl +$ python3 app/src/main/python/create-centroid-feed.py laion5b_100m_part_1_of_10.parquet > centroids.jsonl-Generate the image feed, this merges the embedding data with the metadata and creates a Vespa -jsonl feed file, with one json operation per line. - +Generate the image feed; this reads both embedding data and metadata from the parquet file +and creates a Vespa jsonl feed file, with one json operation per line. +
-$ python3 app/src/main/python/create-joined-feed.py metadata_0000.parquet img_emb_0000.npy > feed.jsonl +$ python3 app/src/main/python/create-joined-feed.py laion5b_100m_part_1_of_10.parquet > feed.jsonl-To process the entire dataset, we recommend starting several processes, each operating on separate split files +To process the entire dataset (100M vectors), download all 10 parts +(`laion5b_100m_part_1_of_10.parquet` through `laion5b_100m_part_10_of_10.parquet`) +and start several processes, each operating on separate part files, as the processing implementation is single-threaded. diff --git a/billion-scale-image-search/app/src/main/python/create-centroid-feed.py b/billion-scale-image-search/app/src/main/python/create-centroid-feed.py index 99842b60a..1da5c8c08 100644 --- a/billion-scale-image-search/app/src/main/python/create-centroid-feed.py +++ b/billion-scale-image-search/app/src/main/python/create-centroid-feed.py @@ -6,16 +6,19 @@ import numpy as np import numpy.random as r import mmh3 +import pyarrow.parquet as pq file = sys.argv[1] -vectors = np.load(file) +parquet_file = pq.ParquetFile(file) -for index in range(0, vectors.shape[0]): - if 0 == r.randint(0, 8): - vector = vectors[index].astype(np.float32) - id = mmh3.hash(vector.tobytes()) # 32 bits signed int - doc = { - "put": "id:laion:centroid::%i" % id, - "fields": {"id": id, "vector": {"values": vector.tolist()}}, - } - print(json.dumps(doc)) +for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]): + vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32) + for index in range(0, vectors.shape[0]): + if 0 == r.randint(0, 8): + vector = vectors[index] + id = mmh3.hash(vector.tobytes()) # 32 bits signed int + doc = { + "put": "id:laion:centroid::%i" % id, + "fields": {"id": id, "vector": {"values": vector.tolist()}}, + } + print(json.dumps(doc)) diff --git a/billion-scale-image-search/app/src/main/python/create-joined-feed.py 
b/billion-scale-image-search/app/src/main/python/create-joined-feed.py index 255a3a926..1b9db8723 100644 --- a/billion-scale-image-search/app/src/main/python/create-joined-feed.py +++ b/billion-scale-image-search/app/src/main/python/create-joined-feed.py @@ -1,12 +1,11 @@ # Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #! /usr/bin/env python3 -import pandas as pd import sys import json import numpy as np import mmh3 -import binascii +import pyarrow.parquet as pq def compute_hash(url, text): if url is None: @@ -14,7 +13,7 @@ def compute_hash(url, text): if text is None: text = '' - + total = (url + text).encode("utf-8") return mmh3.hash64(total)[0] @@ -23,31 +22,32 @@ def nan_handler(number): return 0 else : return number - - -df = pd.read_parquet(sys.argv[1]) -vectors = np.load(sys.argv[2], mmap_mode='r') - -for index, row in df.iterrows(): - url = row['url'] - caption = row['caption'] - id = compute_hash(url, caption) - similarity = nan_handler(row['similarity']) - similarity_scaled = min(int(100*similarity), 127) - doc = { - "put": "id:laion:image::%i" % id, - "fields": { - "url": row['url'], - "caption": row['caption'], - "nsfw": row['NSFW'], - "similarity": similarity_scaled, - "height": row['height'], - "width": row['width'], - "license": row['LICENSE'], - "vector": { - "values": vectors[index].astype(np.float32).tolist() + + +parquet_file = pq.ParquetFile(sys.argv[1]) + +for batch in parquet_file.iter_batches(batch_size=50000): + df = batch.to_pandas() + for index, row in df.iterrows(): + url = row['url'] + caption = row['caption'] + id = compute_hash(url, caption) + similarity = nan_handler(row['similarity']) + similarity_scaled = min(int(100*similarity), 127) + vector = np.array(row['vector'], dtype=np.float32) + doc = { + "put": "id:laion:image::%i" % id, + "fields": { + "url": row['url'], + "caption": row['caption'], + "nsfw": row['NSFW'], + "similarity": similarity_scaled, + "height": 
row['height'], + "width": row['width'], + "license": row['LICENSE'], + "vector": { + "values": vector.tolist() + } } } - } - print(json.dumps(doc)) - + print(json.dumps(doc)) diff --git a/billion-scale-image-search/app/src/main/python/pca_train.py b/billion-scale-image-search/app/src/main/python/pca_train.py index c40c775c9..82bbd9561 100644 --- a/billion-scale-image-search/app/src/main/python/pca_train.py +++ b/billion-scale-image-search/app/src/main/python/pca_train.py @@ -3,17 +3,19 @@ from sklearn.decomposition import IncrementalPCA import numpy as np +import pyarrow.parquet as pq from tqdm import tqdm pca128 = IncrementalPCA(n_components=128) pca64 = IncrementalPCA(n_components=64) -files = ["{:04d}".format(i) for i in range(0,2314)] -sample = np.random.choice(files, size=200) -for s in tqdm(sample): - vectors = np.load("img_emb_%s.npy" % s) - pca128 = pca128.partial_fit(vectors) - pca64 = pca64.partial_fit(vectors) +files = ["laion5b_100m_part_{}_of_10.parquet".format(i) for i in range(1, 11)] +for f in tqdm(files): + parquet_file = pq.ParquetFile(f) + for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]): + vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32) + pca128 = pca128.partial_fit(vectors) + pca64 = pca64.partial_fit(vectors) pca128_comp = np.asarray(pca128.components_) np.save("pca-128-components.npy", pca128_comp)