From 17dfbe4f9abde794ed274acf315f4f8a53e493eb Mon Sep 17 00:00:00 2001 From: amalal-mazrua <50999159+amalal-mazrua@users.noreply.github.com> Date: Fri, 10 Apr 2026 14:43:47 +0300 Subject: [PATCH 1/2] Fix BERTUnfactoredDisambiguator.pretrained() clipping text when a word token is unknown and tokenize() returns [] --- camel_tools/disambig/bert/_bert_morph_dataset.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/camel_tools/disambig/bert/_bert_morph_dataset.py b/camel_tools/disambig/bert/_bert_morph_dataset.py index 6ae4aea..02feb26 100644 --- a/camel_tools/disambig/bert/_bert_morph_dataset.py +++ b/camel_tools/disambig/bert/_bert_morph_dataset.py @@ -141,6 +141,13 @@ def _featurize_input(self, prepared_sentences, label_list, max_seq_length, for word, label in zip(sentence.words, sentence.labels): word_tokens = tokenizer.tokenize(word) + # if word_tokens ==[]: + # If the word is not in the vocabulary, we use the + # [UNK] token to represent it. + if not word_tokens or all(token == tokenizer.unk_token for token in word_tokens): + # word_tokens = ['[UNK]'] + # Use the [UNK] label id for the unknown word + word_tokens = [tokenizer.unk_token] # bert-base-multilingual-cased sometimes output "nothing ([]) # when calling tokenize with just a space. 
if len(word_tokens) > 0: From 31f47e260e403354cc0a049aee052abe55b3639c Mon Sep 17 00:00:00 2001 From: amalal-mazrua <50999159+amalal-mazrua@users.noreply.github.com> Date: Fri, 10 Apr 2026 16:52:17 +0300 Subject: [PATCH 2/2] Refactor _featurize_input --- .../disambig/bert/_bert_morph_dataset.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/camel_tools/disambig/bert/_bert_morph_dataset.py b/camel_tools/disambig/bert/_bert_morph_dataset.py index 02feb26..1903754 100644 --- a/camel_tools/disambig/bert/_bert_morph_dataset.py +++ b/camel_tools/disambig/bert/_bert_morph_dataset.py @@ -140,21 +140,19 @@ def _featurize_input(self, prepared_sentences, label_list, max_seq_length, label_ids = [] for word, label in zip(sentence.words, sentence.labels): + if word is None or word.strip() == "": + continue # skip empty or whitespace words and do not add them to the tokens and label_ids lists word_tokens = tokenizer.tokenize(word) - # if word_tokens ==[]: - # If the word is not in the vocabulary, we use the - # [UNK] token to represent it. + # If the word is not in the vocabulary, we use the [UNK] token to represent it. if not word_tokens or all(token == tokenizer.unk_token for token in word_tokens): # word_tokens = ['[UNK]'] # Use the [UNK] label id for the unknown word word_tokens = [tokenizer.unk_token] - # bert-base-multilingual-cased sometimes output "nothing ([]) - # when calling tokenize with just a space. - if len(word_tokens) > 0: - tokens.append(word_tokens) - # Use the real label id for the first token of the word, - # and padding ids for the remaining tokens - label_ids.append([label_map[label]] + + + tokens.append(word_tokens) + # Use the real label id for the first token of the word, + # and padding ids for the remaining tokens + label_ids.append([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))