diff --git a/msmarco-ranking/README.md b/msmarco-ranking/README.md
index cfeb8d6f1..892eba4c4 100644
--- a/msmarco-ranking/README.md
+++ b/msmarco-ranking/README.md
@@ -143,17 +143,17 @@ Note that the `@query` parameter substitution syntax requires Vespa 8.299 or abo
vespa query 'query=what was the manhattan project' \
- 'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5, q)'\
- 'input.query(q)=embed(e5, @query)' \
- 'input.query(qt)=embed(colbert, @query)' \
+ 'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5_embedding, q)'\
+ 'input.query(q)=embed(e5_embedding_model, @query)' \
+ 'input.query(qt)=embed(colbert_embedding_model, @query)' \
'ranking=e5-colbert'
vespa query 'query=what was the manhattan project' \
- 'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5, q))'\
- 'input.query(q)=embed(e5, @query)' \
- 'input.query(qt)=embed(colbert, @query)' \
+ 'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5_embedding, q))'\
+ 'input.query(q)=embed(e5_embedding_model, @query)' \
+ 'input.query(qt)=embed(colbert_embedding_model, @query)' \
'input.query(query_token_ids)=embed(tokenizer, @query)' \
'ranking=e5-colbert-cross-encoder-rrf'
@@ -171,12 +171,15 @@ With the [evaluate_passage_run.py](python/evaluate_passage_run.py)
we can run retrieval and ranking using the methods demonstrated.
To do so, we need to index the entire dataset as follows:
+
+**Note** The ir_datasets utility will download MS Marco query evaluation data,
+so the first run will take some time to complete (upwards of several days on a powerful laptop; consider running the feeding on a dedicated machine or cloud instance).
+
-ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py | vespa feed -
+ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py |vespa feed -
-Note that the ir_datasets utility will download MS Marco query evaluation data,
-so the first run will take some time to complete.
+
 **BM25(WAND) Single-phase sparse retrieval**
diff --git a/msmarco-ranking/python/evaluate_passage_run.py b/msmarco-ranking/python/evaluate_passage_run.py
index 23364cc01..03f42edf1 100755
--- a/msmarco-ranking/python/evaluate_passage_run.py
+++ b/msmarco-ranking/python/evaluate_passage_run.py
@@ -14,31 +14,31 @@
'yql': 'select id from passage where userQuery()',
'ranking': 'bm25',
'query': '{query}',
- 'input.query(qt)':'embed(colbert, "{query}")'
+ 'input.query(qt)':'embed(colbert_embedding_model, "{query}")'
},
'bm25-colbert': {
'yql': 'select id from passage where userQuery()',
'ranking': 'bm25-colbert',
- 'input.query(qt)':'embed(colbert, "{query}")',
+ 'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'query': '{query}',
'ranking.rerankCount': 100
},
"e5": {
- 'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
- 'input.query(q)': 'embed(e5, "{query}")',
- 'ranking': 'e5',
+ 'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+ 'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+ 'ranking': 'e5-similarity',
},
"e5-colbert": {
- 'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
- 'input.query(q)': 'embed(e5, "{query}")',
- 'input.query(qt)':'embed(colbert, "{query}")',
+ 'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+ 'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+ 'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'ranking': 'e5-colbert',
'ranking.rerankCount': 100
},
"e5-colbert-cross-encoder-rrf": {
- 'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
- 'input.query(q)': 'embed(e5, "{query}")',
- 'input.query(qt)':'embed(colbert, "{query}")',
+ 'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
+ 'input.query(q)': 'embed(e5_embedding_model, "{query}")',
+ 'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'input.query(query_token_ids)':'embed(tokenizer, "{query}")',
'ranking': 'e5-colbert-cross-encoder-rrf',
'ranking.rerankCount': 100,
diff --git a/msmarco-ranking/schemas/passage.sd b/msmarco-ranking/schemas/passage.sd
index 2d2219e43..906864c9b 100644
--- a/msmarco-ranking/schemas/passage.sd
+++ b/msmarco-ranking/schemas/passage.sd
@@ -4,7 +4,7 @@ schema passage {
document passage {
- field id type string {
+ field id type string {
indexing: summary | attribute
}
field text type string {
@@ -22,9 +22,9 @@ schema passage {
attribute: paged
}
- field e5 type tensor(x[384]) {
- # e5 prefix instruction for document
- indexing: input text | embed e5 | attribute | index
+ field e5_embedding type tensor(x[384]) {
+ # Using the e5 embedding model defined in services.xml
+ indexing: input text | embed e5_embedding_model | attribute | index
attribute {
distance-metric: angular
}
@@ -36,9 +36,9 @@ schema passage {
}
}
- field colbert type tensor(dt{}, x[16]) {
+ field colbert_embeddings type tensor(dt{}, x[16]) {
# No index - used for ranking, not retrieval
- indexing: input text | embed colbert | attribute
+ indexing: input text | embed colbert_embedding_model | attribute
attribute: paged
}
@@ -56,35 +56,29 @@ schema passage {
}
}
- rank-profile e5 {
+ rank-profile e5-similarity {
inputs {
query(q) tensor(x[384])
}
first-phase {
- expression: closeness(field, e5)
+ expression: closeness(field, e5_embedding)
}
}
- rank-profile bm25-colbert inherits e5-colbert {
- first-phase {
- expression: bm25(text)
- }
- }
-
- rank-profile e5-colbert inherits e5 {
+ rank-profile e5-colbert inherits e5-similarity {
inputs {
query(qt) tensor(qt{},x[128])
query(q) tensor(x[384])
}
function cos_sim() {
- expression: cos(distance(field, e5))
+ expression: cos(distance(field, e5_embedding))
}
function max_sim() {
expression {
sum(
reduce(
sum(
- query(qt) * unpack_bits(attribute(colbert)), x
+ query(qt) * unpack_bits(attribute(colbert_embeddings)), x
),
max, dt
),
@@ -100,8 +94,14 @@ schema passage {
match-features: max_sim() cos_sim()
}
+ rank-profile bm25-colbert inherits e5-colbert {
+ # Overrides the first-phase expression of the e5-colbert rank-profile
+ first-phase {
+ expression: bm25(text)
+ }
+ }
+
rank-profile e5-colbert-rrf inherits e5-colbert {
-
global-phase {
rerank-count: 200
expression: reciprocal_rank(cos_sim) + reciprocal_rank(max_sim)
@@ -129,7 +129,7 @@ schema passage {
sum(
reduce(
sum(
- query(qt) * unpack_bits(attribute(colbert)), x
+ query(qt) * unpack_bits(attribute(colbert_embeddings)), x
),
max, dt
),
@@ -138,7 +138,7 @@ schema passage {
}
}
function e5_cos_sim() {
- expression: cos(distance(field, e5))
+ expression: cos(distance(field, e5_embedding))
}
function cross_encoder() {
expression: onnx(ranker){d0:0,d1:0}
diff --git a/msmarco-ranking/services.xml b/msmarco-ranking/services.xml
index 70b1fdaed..7218a7b4d 100644
--- a/msmarco-ranking/services.xml
+++ b/msmarco-ranking/services.xml
@@ -14,7 +14,7 @@
-
+
@@ -24,7 +24,7 @@
-
+