From 17dfbe4f9abde794ed274acf315f4f8a53e493eb Mon Sep 17 00:00:00 2001
From: amalal-mazrua <50999159+amalal-mazrua@users.noreply.github.com>
Date: Fri, 10 Apr 2026 14:43:47 +0300
Subject: [PATCH 1/2] fix BERTUnfactoredDisambiguator.pretrained() clipping text
 when an unknown word tokenizes to []

---
 camel_tools/disambig/bert/_bert_morph_dataset.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/camel_tools/disambig/bert/_bert_morph_dataset.py b/camel_tools/disambig/bert/_bert_morph_dataset.py
index 6ae4aea..02feb26 100644
--- a/camel_tools/disambig/bert/_bert_morph_dataset.py
+++ b/camel_tools/disambig/bert/_bert_morph_dataset.py
@@ -141,6 +141,13 @@ def _featurize_input(self, prepared_sentences, label_list, max_seq_length,
 
             for word, label in zip(sentence.words, sentence.labels):
                 word_tokens = tokenizer.tokenize(word)
+                # if word_tokens ==[]:
+                # If the word is not in the vocabulary, we use the
+                # [UNK] token to represent it.
+                if not word_tokens or all(token == tokenizer.unk_token for token in word_tokens):
+                    # word_tokens = ['[UNK]']
+                    # Use the [UNK] label id for the unknown word
+                    word_tokens = [tokenizer.unk_token]
                 # bert-base-multilingual-cased sometimes output "nothing ([])
                 # when calling tokenize with just a space.
                 if len(word_tokens) > 0:

From 31f47e260e403354cc0a049aee052abe55b3639c Mon Sep 17 00:00:00 2001
From: amalal-mazrua <50999159+amalal-mazrua@users.noreply.github.com>
Date: Fri, 10 Apr 2026 16:52:17 +0300
Subject: [PATCH 2/2] refactor _featurize_input

---
 .../disambig/bert/_bert_morph_dataset.py       | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/camel_tools/disambig/bert/_bert_morph_dataset.py b/camel_tools/disambig/bert/_bert_morph_dataset.py
index 02feb26..1903754 100644
--- a/camel_tools/disambig/bert/_bert_morph_dataset.py
+++ b/camel_tools/disambig/bert/_bert_morph_dataset.py
@@ -140,21 +140,19 @@ def _featurize_input(self, prepared_sentences, label_list, max_seq_length,
             label_ids = []
 
             for word, label in zip(sentence.words, sentence.labels):
+                if word is None or word.strip() == "":
+                    continue  # skip empty or whitespace words and do not add them to the tokens and label_ids lists
                 word_tokens = tokenizer.tokenize(word)
-                # if word_tokens ==[]:
-                # If the word is not in the vocabulary, we use the
-                # [UNK] token to represent it.
+                # If the word is not in the vocabulary, we use the [UNK] token to represent it.
                 if not word_tokens or all(token == tokenizer.unk_token for token in word_tokens):
                     # word_tokens = ['[UNK]']
                     # Use the [UNK] label id for the unknown word
                     word_tokens = [tokenizer.unk_token]
-                # bert-base-multilingual-cased sometimes output "nothing ([])
-                # when calling tokenize with just a space.
-                if len(word_tokens) > 0:
-                    tokens.append(word_tokens)
-                    # Use the real label id for the first token of the word,
-                    # and padding ids for the remaining tokens
-                    label_ids.append([label_map[label]] +
+
+                tokens.append(word_tokens)
+                # Use the real label id for the first token of the word,
+                # and padding ids for the remaining tokens
+                label_ids.append([label_map[label]] +
                                      [pad_token_label_id] *
                                      (len(word_tokens) - 1))