From 6b527e52c463161badc670bc357052b9102a7bbe Mon Sep 17 00:00:00 2001
From: Mushkan Rana
Date: Sun, 22 Mar 2026 09:17:36 +0530
Subject: [PATCH] Support multi-field TSV parsing in collection loader

Treat each line of a collection file as `id<TAB>field[<TAB>field...]`:
drop the leading id column and join the remaining fields with single
spaces. Lines without a tab are loaded unchanged, so existing
one-passage-per-line text files keep working and passage positions stay
aligned with line numbers.

---
Review notes (ignored by `git am`):
 - The parsing is done inline in `Indexer` rather than in a helper inserted
   between the docstring and `function Indexer`; a definition placed there
   would take over the constructor's docstring.
 - Line breaks and indentation of the context lines were reconstructed from
   a whitespace-collapsed copy of this patch; check them against
   src/indexing.jl before applying. The `index` line still carries the blob
   ids of the original submission.

 src/indexing.jl | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/indexing.jl b/src/indexing.jl
index 9f40765..b93bb63 100644
--- a/src/indexing.jl
+++ b/src/indexing.jl
@@ -21,12 +21,24 @@ Type representing an ColBERT indexer.
 
 An [`Indexer`] wrapping a [`ColBERTConfig`](@ref) along with the trained ColBERT model.
 """
 function Indexer(config::ColBERTConfig)
     tokenizer, bert, linear = load_hgf_pretrained_local(config.checkpoint)
     bert = bert |> Flux.gpu
     linear = linear |> Flux.gpu
-    collection = config.collection isa String ? readlines(config.collection) :
-                 config.collection
+    collection = if config.collection isa String
+        # One passage per line: `id<TAB>field[<TAB>field...]`. The id column is
+        # dropped and the remaining fields are joined with a space. A line with no
+        # tab is kept verbatim instead of being skipped, so plain-text collections
+        # still load and passage positions keep matching line numbers.
+        map(readlines(config.collection)) do line
+            fields = split(line, '\t')
+            length(fields) < 2 && return line
+            # `strip` yields a SubString; convert so elements stay `String`.
+            return String(strip(join(@view(fields[2:end]), " ")))
+        end
+    else
+        config.collection
+    end
     punctuations_and_padsym = [string.(collect("!\"#\$%&\'()*+,-./:;<=>?@[\\]^_`{|}~"));
                                tokenizer.padsym]
     skiplist = config.mask_punctuation ?