Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions billion-scale-image-search/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ data-plane-public-cert.pem
data-plane-private-key.pem
centroids.jsonl
feed.jsonl
img_emb_0000.npy
metadata_0000.parquet
laion5b_100m_part_*.parquet
34 changes: 19 additions & 15 deletions billion-scale-image-search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,19 +163,21 @@ $ vespa clone billion-scale-image-search myapp && cd myapp

## Download Vector + Metadata

These instructions use a 100M-vector subset of the LAION-5B CLIP ViT-L/14 embeddings,
hosted on ClickHouse's S3 bucket as parquet files. Each parquet file contains 10M rows with
both the 768-dim CLIP vectors and all metadata columns (url, caption, NSFW, similarity, etc.).

<pre data-test="exec">
$ curl --http1.1 -L -o img_emb_0000.npy \
https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/img_emb/img_emb_0000.npy
</pre>
> **Note:** The parquet files are hosted by ClickHouse as part of their
> [example datasets](https://clickhouse.com/docs/getting-started/example-datasets/laion-5b-dataset).
> This is a third-party resource not maintained by Vespa - availability and terms of use
> are subject to change. The underlying LAION-5B data is released under
> [CC-BY-4.0](https://laion.ai/blog/laion-5b/) for research purposes.

Download the first part (of 10 total):

<pre data-test="exec">
$ curl -L -o metadata_0000.parquet \
https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/laion2B-en-metadata/metadata_0000.parquet
$ curl -L -o laion5b_100m_part_1_of_10.parquet \
https://clickhouse-datasets.s3.amazonaws.com/laion-5b/laion5b_100m_part_1_of_10.parquet
</pre>

Install python dependencies to process the files:
Expand All @@ -189,17 +191,19 @@ centroids. Performing an incremental clustering can improve vector search recall
indexing fewer centroids. For simplicity, this tutorial uses random sampling.

<pre data-test="exec">
$ python3 app/src/main/python/create-centroid-feed.py img_emb_0000.npy > centroids.jsonl
$ python3 app/src/main/python/create-centroid-feed.py laion5b_100m_part_1_of_10.parquet > centroids.jsonl
</pre>

Generate the image feed. This reads both the embedding data and the metadata from the
parquet file and creates a Vespa jsonl feed file, with one json operation per line.

<pre data-test="exec">
$ python3 app/src/main/python/create-joined-feed.py metadata_0000.parquet img_emb_0000.npy > feed.jsonl
$ python3 app/src/main/python/create-joined-feed.py laion5b_100m_part_1_of_10.parquet > feed.jsonl
</pre>

To process the entire dataset, we recommend starting several processes, each operating on separate split files
To process the entire dataset (100M vectors), download all 10 parts
(`laion5b_100m_part_1_of_10.parquet` through `laion5b_100m_part_10_of_10.parquet`)
and start several processes, each operating on separate part files,
as the processing implementation is single-threaded.


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@
import numpy as np
import numpy.random as r
import mmh3
import pyarrow.parquet as pq

file = sys.argv[1]
parquet_file = pq.ParquetFile(file)

# Stream the parquet file in 50k-row batches so a full 10M-row part file
# never has to be materialized in memory at once. Only the "vector"
# column is read since centroids need nothing else.
for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]):
    vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32)
    for index in range(0, vectors.shape[0]):
        # Random sampling: keep ~1/8 of the vectors as centroid candidates.
        if 0 == r.randint(0, 8):
            vector = vectors[index]
            # Derive a stable document id from the raw vector bytes.
            id = mmh3.hash(vector.tobytes())  # 32 bits signed int
            doc = {
                "put": "id:laion:centroid::%i" % id,
                "fields": {"id": id, "vector": {"values": vector.tolist()}},
            }
            print(json.dumps(doc))
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#! /usr/bin/env python3

import pandas as pd
import sys
import json
import numpy as np
import mmh3
import binascii
import pyarrow.parquet as pq

def compute_hash(url, text):
    """Return a stable 64-bit signed document id for a (url, caption) pair.

    A ``None`` url or text is treated as the empty string so rows with
    missing metadata still hash deterministically.
    """
    combined = ('' if url is None else url) + ('' if text is None else text)
    return mmh3.hash64(combined.encode("utf-8"))[0]

Expand All @@ -23,31 +22,32 @@ def nan_handler(number):
return 0
else :
return number


parquet_file = pq.ParquetFile(sys.argv[1])

# Stream the parquet file in 50k-row batches: each part file holds 10M rows,
# so reading it whole (pd.read_parquet) would be needlessly memory-hungry.
for batch in parquet_file.iter_batches(batch_size=50000):
    df = batch.to_pandas()
    for index, row in df.iterrows():
        url = row['url']
        caption = row['caption']
        # Stable 64-bit id derived from url + caption (see compute_hash).
        id = compute_hash(url, caption)
        # Map NaN similarity to 0, then scale/clamp into an int8-friendly
        # range: min(int(100 * similarity), 127).
        similarity = nan_handler(row['similarity'])
        similarity_scaled = min(int(100*similarity), 127)
        vector = np.array(row['vector'], dtype=np.float32)
        doc = {
            "put": "id:laion:image::%i" % id,
            "fields": {
                "url": url,
                "caption": caption,
                "nsfw": row['NSFW'],
                "similarity": similarity_scaled,
                "height": row['height'],
                "width": row['width'],
                "license": row['LICENSE'],
                "vector": {
                    "values": vector.tolist()
                }
            }
        }
        print(json.dumps(doc))
14 changes: 8 additions & 6 deletions billion-scale-image-search/app/src/main/python/pca_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@

from sklearn.decomposition import IncrementalPCA
import numpy as np
import pyarrow.parquet as pq
from tqdm import tqdm

pca128 = IncrementalPCA(n_components=128)
pca64 = IncrementalPCA(n_components=64)

# Fit both PCA models incrementally over all 10 part files, streaming each
# parquet file in 50k-row batches (only the "vector" column is needed) so
# the 100M x 768 matrix is never held in memory at once.
files = ["laion5b_100m_part_{}_of_10.parquet".format(i) for i in range(1, 11)]
for f in tqdm(files):
    parquet_file = pq.ParquetFile(f)
    for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]):
        vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32)
        # partial_fit returns the fitted estimator; rebinding keeps the
        # original assignment style.
        pca128 = pca128.partial_fit(vectors)
        pca64 = pca64.partial_fit(vectors)

pca128_comp = np.asarray(pca128.components_)
np.save("pca-128-components.npy", pca128_comp)
Expand Down
Loading