Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions billion-scale-image-search/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,4 @@ data-plane-public-cert.pem
data-plane-private-key.pem
centroids.jsonl
feed.jsonl
img_emb_0000.npy
metadata_0000.parquet
laion5b_100m_part_*.parquet
34 changes: 19 additions & 15 deletions billion-scale-image-search/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -163,19 +163,21 @@ $ vespa clone billion-scale-image-search myapp && cd myapp

## Download Vector + Metadata

These instructions use a 100M-vector subset of the LAION-5B CLIP ViT-L/14 embeddings,
hosted on ClickHouse's S3 bucket as parquet files. Each parquet file contains 10M rows with
both the 768-dim CLIP vectors and all metadata columns (url, caption, NSFW, similarity, etc.).

<pre data-test="exec">
$ curl --http1.1 -L -o img_emb_0000.npy \
https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/img_emb/img_emb_0000.npy
</pre>
> **Note:** The parquet files are hosted by ClickHouse as part of their
> [example datasets](https://clickhouse.com/docs/getting-started/example-datasets/laion-5b-dataset).
> This is a third-party resource not maintained by Vespa - availability and terms of use
> are subject to change. The underlying LAION-5B data is released under
> [CC-BY-4.0](https://laion.ai/blog/laion-5b/) for research purposes.

Download the first part (of 10 total):

<pre data-test="exec">
$ curl -L -o metadata_0000.parquet \
https://the-eye.eu/public/AI/cah/laion5b/embeddings/laion2B-en/laion2B-en-metadata/metadata_0000.parquet
$ curl -L -o laion5b_100m_part_1_of_10.parquet \
https://clickhouse-datasets.s3.amazonaws.com/laion-5b/laion5b_100m_part_1_of_10.parquet
</pre>

Install python dependencies to process the files:
Expand All @@ -189,17 +191,19 @@ centroids. Performing an incremental clustering can improve vector search recall
indexing fewer centroids. For simplicity, this tutorial uses random sampling.

<pre data-test="exec">
$ python3 app/src/main/python/create-centroid-feed.py img_emb_0000.npy > centroids.jsonl
$ python3 app/src/main/python/create-centroid-feed.py laion5b_100m_part_1_of_10.parquet > centroids.jsonl
</pre>

Generate the image feed. This reads both the embedding data and the metadata from the
parquet file and creates a Vespa jsonl feed file, with one json operation per line.

<pre data-test="exec">
$ python3 app/src/main/python/create-joined-feed.py metadata_0000.parquet img_emb_0000.npy > feed.jsonl
$ python3 app/src/main/python/create-joined-feed.py laion5b_100m_part_1_of_10.parquet > feed.jsonl
</pre>

To process the entire dataset, we recommend starting several processes, each operating on separate split files
To process the entire dataset (100M vectors), download all 10 parts
(`laion5b_100m_part_1_of_10.parquet` through `laion5b_100m_part_10_of_10.parquet`)
and start several processes, each operating on separate part files,
as the processing implementation is single-threaded.


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,19 @@
import numpy as np
import numpy.random as r
import mmh3
import pyarrow.parquet as pq

file = sys.argv[1]
parquet_file = pq.ParquetFile(file)

# Stream the parquet file in 50k-row batches so a full 10M-row part file
# never has to be materialized in memory at once. Only the "vector"
# column is read since centroids need nothing else.
for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]):
    vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32)
    for index in range(0, vectors.shape[0]):
        # Random sampling: keep ~1/8 of the vectors as centroid candidates.
        if 0 == r.randint(0, 8):
            vector = vectors[index]
            # Derive a stable document id from the raw vector bytes.
            id = mmh3.hash(vector.tobytes())  # 32 bits signed int
            doc = {
                "put": "id:laion:centroid::%i" % id,
                "fields": {"id": id, "vector": {"values": vector.tolist()}},
            }
            print(json.dumps(doc))
Original file line number Diff line number Diff line change
@@ -1,20 +1,19 @@
# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#! /usr/bin/env python3

import pandas as pd
import sys
import json
import numpy as np
import mmh3
import binascii
import pyarrow.parquet as pq

def compute_hash(url, text):
    """Return a stable 64-bit signed document id for a (url, caption) pair.

    A ``None`` url or text is treated as the empty string so rows with
    missing metadata still hash deterministically.
    """
    combined = ('' if url is None else url) + ('' if text is None else text)
    return mmh3.hash64(combined.encode("utf-8"))[0]

Expand All @@ -23,31 +22,32 @@ def nan_handler(number):
return 0
else :
return number


parquet_file = pq.ParquetFile(sys.argv[1])

# Stream the parquet file in 50k-row batches: each part file holds 10M rows,
# so reading it whole (pd.read_parquet) would be needlessly memory-hungry.
for batch in parquet_file.iter_batches(batch_size=50000):
    df = batch.to_pandas()
    for index, row in df.iterrows():
        url = row['url']
        caption = row['caption']
        # Stable 64-bit id derived from url + caption (see compute_hash).
        id = compute_hash(url, caption)
        # Map NaN similarity to 0, then scale/clamp into an int8-friendly
        # range: min(int(100 * similarity), 127).
        similarity = nan_handler(row['similarity'])
        similarity_scaled = min(int(100*similarity), 127)
        vector = np.array(row['vector'], dtype=np.float32)
        doc = {
            "put": "id:laion:image::%i" % id,
            "fields": {
                "url": url,
                "caption": caption,
                "nsfw": row['NSFW'],
                "similarity": similarity_scaled,
                "height": row['height'],
                "width": row['width'],
                "license": row['LICENSE'],
                "vector": {
                    "values": vector.tolist()
                }
            }
        }
        print(json.dumps(doc))
14 changes: 8 additions & 6 deletions billion-scale-image-search/app/src/main/python/pca_train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,19 @@

from sklearn.decomposition import IncrementalPCA
import numpy as np
import pyarrow.parquet as pq
from tqdm import tqdm

pca128 = IncrementalPCA(n_components=128)
pca64 = IncrementalPCA(n_components=64)

# Fit both PCA models incrementally over all 10 part files, streaming each
# parquet file in 50k-row batches (only the "vector" column is needed) so
# the 100M x 768 matrix is never held in memory at once.
files = ["laion5b_100m_part_{}_of_10.parquet".format(i) for i in range(1, 11)]
for f in tqdm(files):
    parquet_file = pq.ParquetFile(f)
    for batch in parquet_file.iter_batches(batch_size=50000, columns=["vector"]):
        vectors = np.array(batch.column("vector").to_pylist(), dtype=np.float32)
        # partial_fit returns the fitted estimator; rebinding keeps the
        # original assignment style.
        pca128 = pca128.partial_fit(vectors)
        pca64 = pca64.partial_fit(vectors)

pca128_comp = np.asarray(pca128.components_)
np.save("pca-128-components.npy", pca128_comp)
Expand Down
Loading