diff --git a/msmarco-ranking/README.md b/msmarco-ranking/README.md index cfeb8d6f1..892eba4c4 100644 --- a/msmarco-ranking/README.md +++ b/msmarco-ranking/README.md @@ -143,17 +143,17 @@ Note that the `@query` parameter substitution syntax requires Vespa 8.299 or abo
 vespa query 'query=what was the manhattan project' \
- 'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5, q)'\
- 'input.query(q)=embed(e5, @query)' \
- 'input.query(qt)=embed(colbert, @query)' \
+ 'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5_embedding, q)'\
+ 'input.query(q)=embed(e5_embedding_model, @query)' \
+ 'input.query(qt)=embed(colbert_embedding_model, @query)' \
  'ranking=e5-colbert'
 
 vespa query 'query=what was the manhattan project' \
- 'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5, q))'\
- 'input.query(q)=embed(e5, @query)' \
- 'input.query(qt)=embed(colbert, @query)' \
+ 'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5_embedding, q))'\
+ 'input.query(q)=embed(e5_embedding_model, @query)' \
+ 'input.query(qt)=embed(colbert_embedding_model, @query)' \
  'input.query(query_token_ids)=embed(tokenizer, @query)' \
  'ranking=e5-colbert-cross-encoder-rrf'
  
@@ -171,12 +171,15 @@ With the [evaluate_passage_run.py](python/evaluate_passage_run.py) we can run retrieval and ranking using the methods demonstrated. To do so, we need to index the entire dataset as follows: + +**Note** The ir_datasets utility will download MS Marco query evaluation data, +so the first run will take some time to complete (upwards of several days on a powerful laptop; consider running the feeding on a dedicated machine or cloud instance). +
-ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py | vespa feed -
+ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py |vespa feed -
 
-Note that the ir_datasets utility will download MS Marco query evaluation data, -so the first run will take some time to complete. + **BM25(WAND) Single-phase sparse retrieval**
diff --git a/msmarco-ranking/python/evaluate_passage_run.py b/msmarco-ranking/python/evaluate_passage_run.py
index 23364cc01..03f42edf1 100755
--- a/msmarco-ranking/python/evaluate_passage_run.py
+++ b/msmarco-ranking/python/evaluate_passage_run.py
@@ -14,31 +14,31 @@
     'yql': 'select id from passage where userQuery()',
     'ranking': 'bm25',
     'query': '{query}',
-    'input.query(qt)':'embed(colbert, "{query}")'
+    'input.query(qt)':'embed(colbert_embedding_model, "{query}")'
   },
    'bm25-colbert': {
     'yql': 'select id from passage where userQuery()',
     'ranking': 'bm25-colbert',
-    'input.query(qt)':'embed(colbert, "{query}")',
+    'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
     'query': '{query}',
     'ranking.rerankCount': 100
   },
   "e5": {
-    'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
-    'input.query(q)': 'embed(e5, "{query}")',
-    'ranking': 'e5',
+    'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+    'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+    'ranking': 'e5-similarity',
   },
   "e5-colbert": {
-    'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
-    'input.query(q)': 'embed(e5, "{query}")',
-    'input.query(qt)':'embed(colbert, "{query}")',
+    'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+    'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+    'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
     'ranking': 'e5-colbert',
     'ranking.rerankCount': 100
   },
   "e5-colbert-cross-encoder-rrf": {
-    'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
-    'input.query(q)': 'embed(e5, "{query}")',
-    'input.query(qt)':'embed(colbert, "{query}")',
+    'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+    'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+    'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
     'input.query(query_token_ids)':'embed(tokenizer, "{query}")',
     'ranking': 'e5-colbert-cross-encoder-rrf',
     'ranking.rerankCount': 100, 
diff --git a/msmarco-ranking/schemas/passage.sd b/msmarco-ranking/schemas/passage.sd
index 2d2219e43..906864c9b 100644
--- a/msmarco-ranking/schemas/passage.sd
+++ b/msmarco-ranking/schemas/passage.sd
@@ -4,7 +4,7 @@ schema passage {
 
   document passage {
 
-     field id type string {
+    field id type string {
       indexing: summary | attribute
     }
     field text type string {
@@ -22,9 +22,9 @@ schema passage {
     attribute: paged
   }
 
-  field e5 type tensor(x[384]) {
-    # e5 prefix instruction for document 
-    indexing: input text | embed e5 | attribute | index
+  field e5_embedding type tensor(x[384]) {
+    # Using the e5 embedding model defined in services.xml
+    indexing: input text | embed e5_embedding_model | attribute | index
     attribute {
       distance-metric: angular
     }
@@ -36,9 +36,9 @@ schema passage {
     }
   }
 
-  field colbert type tensor(dt{}, x[16]) {
+  field colbert_embeddings type tensor(dt{}, x[16]) {
     # No index - used for ranking, not retrieval 
-    indexing: input text | embed colbert | attribute
+    indexing: input text | embed colbert_embedding_model | attribute
     attribute: paged
   }
 
@@ -56,35 +56,29 @@ schema passage {
     }
   }
 
-  rank-profile e5 {
+  rank-profile e5-similarity {
     inputs {
       query(q) tensor(x[384])
     }
     first-phase {
-      expression: closeness(field, e5)
+      expression: closeness(field, e5_embedding)
     } 
   }
 
-  rank-profile bm25-colbert inherits e5-colbert {
-    first-phase {
-      expression: bm25(text)
-    }
-  }
-
-  rank-profile e5-colbert inherits e5 {
+  rank-profile e5-colbert inherits e5-similarity {
     inputs {
       query(qt) tensor(qt{},x[128])
       query(q) tensor(x[384])
     }
     function cos_sim() {
-      expression: cos(distance(field, e5))
+      expression: cos(distance(field, e5_embedding))
     }
     function max_sim() {
       expression {
         sum(
           reduce(
             sum(
-              query(qt) * unpack_bits(attribute(colbert)), x
+              query(qt) * unpack_bits(attribute(colbert_embeddings)), x
             ),
             max, dt
           ),
@@ -100,8 +94,14 @@ schema passage {
     match-features: max_sim() cos_sim()
   }
 
+  rank-profile bm25-colbert inherits e5-colbert {
+    # Overrides the first-phase expression of the e5-colbert rank-profile
+    first-phase {
+      expression: bm25(text)
+    }
+  }
+
   rank-profile e5-colbert-rrf inherits e5-colbert {
-   
     global-phase {
       rerank-count: 200
       expression: reciprocal_rank(cos_sim) + reciprocal_rank(max_sim)
@@ -129,7 +129,7 @@ schema passage {
         sum(
           reduce(
             sum(
-              query(qt) * unpack_bits(attribute(colbert)), x
+              query(qt) * unpack_bits(attribute(colbert_embeddings)), x
             ),
             max, dt
           ),
@@ -138,7 +138,7 @@ schema passage {
        }
     }
     function e5_cos_sim() {
-      expression: cos(distance(field, e5))
+      expression: cos(distance(field, e5_embedding))
     }
     function cross_encoder() {
       expression: onnx(ranker){d0:0,d1:0}
diff --git a/msmarco-ranking/services.xml b/msmarco-ranking/services.xml
index 70b1fdaed..7218a7b4d 100644
--- a/msmarco-ranking/services.xml
+++ b/msmarco-ranking/services.xml
@@ -14,7 +14,7 @@
     
 
      
-    
+    
             
             
             
@@ -24,7 +24,7 @@
     
 
     
-    
+