Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions camel_tools/disambig/bert/_bert_morph_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,14 +140,19 @@ def _featurize_input(self, prepared_sentences, label_list, max_seq_length,
label_ids = []

for word, label in zip(sentence.words, sentence.labels):
if word is None or word.strip() == "":
continue # skip empty or whitespace words and do not add them to the tokens and label_ids lists
word_tokens = tokenizer.tokenize(word)
# bert-base-multilingual-cased sometimes outputs nothing ([])
# when calling tokenize with just a space.
if len(word_tokens) > 0:
tokens.append(word_tokens)
# Use the real label id for the first token of the word,
# and padding ids for the remaining tokens
label_ids.append([label_map[label]] +
# If tokenization yields no tokens, or only [UNK] tokens, represent the word with a single [UNK] token.
if not word_tokens or all(token == tokenizer.unk_token for token in word_tokens):
# word_tokens = ['[UNK]']
# Collapse the word to a single [UNK] token; the word still keeps its real label id below.
word_tokens = [tokenizer.unk_token]

tokens.append(word_tokens)
# Use the real label id for the first token of the word,
# and padding ids for the remaining tokens
label_ids.append([label_map[label]] +
[pad_token_label_id] *
(len(word_tokens) - 1))

Expand Down