Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 12 additions & 9 deletions msmarco-ranking/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,17 @@ Note that the `@query` parameter substitution syntax requires Vespa 8.299 or abo

<pre data-test="exec" data-test-assert-contains='Manhattan'>
vespa query 'query=what was the manhattan project' \
'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5, q)'\
'input.query(q)=embed(e5, @query)' \
'input.query(qt)=embed(colbert, @query)' \
'yql=select * from passage where {targetHits: 100}nearestNeighbor(e5_embedding, q)'\
'input.query(q)=embed(e5_embedding_model, @query)' \
'input.query(qt)=embed(colbert_embedding_model, @query)' \
'ranking=e5-colbert'
</pre>

<pre data-test="exec" data-test-assert-contains='Manhattan'>
vespa query 'query=what was the manhattan project' \
'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5, q))'\
'input.query(q)=embed(e5, @query)' \
'input.query(qt)=embed(colbert, @query)' \
'yql=select * from passage where userQuery() or ({targetHits: 100}nearestNeighbor(e5_embedding, q))'\
'input.query(q)=embed(e5_embedding_model, @query)' \
'input.query(qt)=embed(colbert_embedding_model, @query)' \
'input.query(query_token_ids)=embed(tokenizer, @query)' \
'ranking=e5-colbert-cross-encoder-rrf'
</pre>
Expand All @@ -171,12 +171,15 @@ With the [evaluate_passage_run.py](python/evaluate_passage_run.py)
we can run retrieval and ranking using the methods demonstrated.

To do so, we need to index the entire dataset as follows:

**Note** The ir_datasets utility will download MS Marco query evaluation data,
so the first run will take a long time to complete (upwards of several days on a powerful laptop; consider running the feeding on a dedicated machine or cloud instance).

<pre>
ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py | vespa feed -
ir_datasets export msmarco-passage docs --format jsonl |python3 python/to-vespa-feed.py |vespa feed -
</pre>

Note that the ir_datasets utility will download MS Marco query evaluation data,
so the first run will take some time to complete.


**BM25(WAND) Single-phase sparse retrieval**
<pre>
Expand Down
22 changes: 11 additions & 11 deletions msmarco-ranking/python/evaluate_passage_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,31 +14,31 @@
'yql': 'select id from passage where userQuery()',
'ranking': 'bm25',
'query': '{query}',
'input.query(qt)':'embed(colbert, "{query}")'
'input.query(qt)':'embed(colbert_embedding_model, "{query}")'
},
'bm25-colbert': {
'yql': 'select id from passage where userQuery()',
'ranking': 'bm25-colbert',
'input.query(qt)':'embed(colbert, "{query}")',
'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'query': '{query}',
'ranking.rerankCount': 100
},
"e5": {
'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
'input.query(q)': 'embed(e5, "{query}")',
'ranking': 'e5',
'yql': 'select id from passage where {targetHits: 10, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
'input.query(q)': 'embed(e5_embedding_model, "{query}")',
'ranking': 'e5-similarity',
},
"e5-colbert": {
'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
'input.query(q)': 'embed(e5, "{query}")',
'input.query(qt)':'embed(colbert, "{query}")',
'yql': 'select id from passage where {targetHits: 100, hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
'input.query(q)': 'embed(e5_embedding_model, "{query}")',
'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'ranking': 'e5-colbert',
'ranking.rerankCount': 100
},
"e5-colbert-cross-encoder-rrf": {
'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5, q)',
'input.query(q)': 'embed(e5, "{query}")',
'input.query(qt)':'embed(colbert, "{query}")',
'yql': 'select id from passage where {targetHits: 100,hnsw.exploreAdditionalHits:100}nearestNeighbor(e5_embedding, q)',
'input.query(q)': 'embed(e5_embedding_model, "{query}")',
'input.query(qt)':'embed(colbert_embedding_model, "{query}")',
'input.query(query_token_ids)':'embed(tokenizer, "{query}")',
'ranking': 'e5-colbert-cross-encoder-rrf',
'ranking.rerankCount': 100,
Expand Down
40 changes: 20 additions & 20 deletions msmarco-ranking/schemas/passage.sd
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ schema passage {

document passage {

field id type string {
field id type string {
indexing: summary | attribute
}
field text type string {
Expand All @@ -22,9 +22,9 @@ schema passage {
attribute: paged
}

field e5 type tensor<bfloat16>(x[384]) {
# e5 prefix instruction for document
indexing: input text | embed e5 | attribute | index
field e5_embedding type tensor<bfloat16>(x[384]) {
# Using the e5 embedding model defined in services.xml
indexing: input text | embed e5_embedding_model | attribute | index
attribute {
distance-metric: angular
}
Expand All @@ -36,9 +36,9 @@ schema passage {
}
}

field colbert type tensor<int8>(dt{}, x[16]) {
field colbert_embeddings type tensor<int8>(dt{}, x[16]) {
# No index - used for ranking, not retrieval
indexing: input text | embed colbert | attribute
indexing: input text | embed colbert_embedding_model | attribute
attribute: paged
}

Expand All @@ -56,35 +56,29 @@ schema passage {
}
}

rank-profile e5 {
rank-profile e5-similarity {
inputs {
query(q) tensor<float>(x[384])
}
first-phase {
expression: closeness(field, e5)
expression: closeness(field, e5_embedding)
}
}

rank-profile bm25-colbert inherits e5-colbert {
first-phase {
expression: bm25(text)
}
}

rank-profile e5-colbert inherits e5 {
rank-profile e5-colbert inherits e5-similarity {
inputs {
query(qt) tensor<float>(qt{},x[128])
query(q) tensor<float>(x[384])
}
function cos_sim() {
expression: cos(distance(field, e5))
expression: cos(distance(field, e5_embedding))
}
function max_sim() {
expression {
sum(
reduce(
sum(
query(qt) * unpack_bits(attribute(colbert)), x
query(qt) * unpack_bits(attribute(colbert_embeddings)), x
),
max, dt
),
Expand All @@ -100,8 +94,14 @@ schema passage {
match-features: max_sim() cos_sim()
}

rank-profile bm25-colbert inherits e5-colbert {
# Overrides the first-phase expression of the e5-colbert rank-profile
first-phase {
expression: bm25(text)
}
}

rank-profile e5-colbert-rrf inherits e5-colbert {

global-phase {
rerank-count: 200
expression: reciprocal_rank(cos_sim) + reciprocal_rank(max_sim)
Expand Down Expand Up @@ -129,7 +129,7 @@ schema passage {
sum(
reduce(
sum(
query(qt) * unpack_bits(attribute(colbert)), x
query(qt) * unpack_bits(attribute(colbert_embeddings)), x
),
max, dt
),
Expand All @@ -138,7 +138,7 @@ schema passage {
}
}
function e5_cos_sim() {
expression: cos(distance(field, e5))
expression: cos(distance(field, e5_embedding))
}
function cross_encoder() {
expression: onnx(ranker){d0:0,d1:0}
Expand Down
4 changes: 2 additions & 2 deletions msmarco-ranking/services.xml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
<document-api/>

<!-- See https://docs.vespa.ai/en/embedding.html#huggingface-embedder -->
<component id="e5" type="hugging-face-embedder">
<component id="e5_embedding_model" type="hugging-face-embedder">
<transformer-model url="https://huggingface.co/intfloat/e5-small-v2/resolve/main/model.onnx"/>
<tokenizer-model url="https://huggingface.co/intfloat/e5-small-v2/raw/main/tokenizer.json"/>
<prepend>
Expand All @@ -24,7 +24,7 @@
</component>

<!-- See https://docs.vespa.ai/en/embedding.html#colbert-embedder -->
<component id="colbert" type="colbert-embedder">
<component id="colbert_embedding_model" type="colbert-embedder">
<transformer-model url="https://huggingface.co/colbert-ir/colbertv2.0/resolve/main/model.onnx"/>
<tokenizer-model url="https://huggingface.co/colbert-ir/colbertv2.0/raw/main/tokenizer.json"/>
</component>
Expand Down