tokenizer.py
#!/usr/bin/env python
"""
Tool for de/tokenizing and using BPE.
Supported tokenizers:
- Moses.
- Sentencepiece
- Mecab. (Tokenization only.)
- Stanford Word Segmenter. (Tokenization only.)
"""
import codecs
import os
import sys
from re import sub
from subprocess import run

import sentencepiece as spm
from sacremoses import MosesTokenizer, MosesDetokenizer
from subword_nmt.apply_bpe import BPE


class Tokenizer:
def __init__(self, tokenizer, model=None, bpe_codes=None):
"""
:param tokenizer: (str) tokenizer to use (moses, sentencepiece, mecab or stanford).
:param model: (str) path to tokenizer model.
:param bpe_codes: (str) path to bpe_codes.
"""
# Set tokenizer.
if tokenizer == 'moses':
self.tokenizer, self.detokenizer = MosesTokenizer(), MosesDetokenizer()
elif tokenizer == 'sentencepiece':
self.tokenizer = spm.SentencePieceProcessor()
if model is None:
raise ValueError("Tokenizer model is mandatory for Sentencepiece.")
self.tokenizer.Load(model)
self.detokenizer = None
elif tokenizer == 'mecab':
self.tokenizer = None
self.detokenizer = None
elif tokenizer == 'stanford':
self.tokenizer = None
self.detokenizer = None
else:
raise ValueError("Invalid value for tokenizer. Supported tokenizers: moses, "
"sentencepiece, mecab and stanford")
self.tokenizer_type = tokenizer
        # Load BPE codes if a codes file is provided; otherwise BPE is disabled.
        self.bpe = None if bpe_codes is None else BPE(codecs.open(bpe_codes, encoding='utf-8'))

    def tokenize(self, sentence):
"""
This method tokenizes a sentence.
:param sentence: (str) sentence.
:return: (str) tokenized sentence.
"""
if self.tokenizer_type == 'moses':
return " ".join(self.tokenizer.tokenize(sentence))
if self.tokenizer_type == 'sentencepiece':
return " ".join(self.tokenizer.EncodeAsPieces(sentence))
        # MeCab and the Stanford segmenter are shipped as external tools and are
        # invoked relative to the directory of the running script.
        dir_path = os.path.dirname(os.path.realpath(sys.argv[0]))
        if self.tokenizer_type == 'mecab':
            try:
                sentence_ = run([dir_path + '/tokenizers/mecab/bin/mecab', '-O', 'wakati'],
                                capture_output=True, encoding='utf-8', input=sentence).stdout.strip()
            except OSError:
                raise ValueError("MeCab is not installed.")
            return sentence_
        if self.tokenizer_type == 'stanford':
            try:
                sentence_ = run([dir_path + '/tokenizers/stanford_segmenter/segment.sh', 'ctb', '/dev/stdin',
                                 'UTF-8', '0'], capture_output=True, encoding='utf-8',
                                input=sentence).stdout.strip()
            except OSError:
                raise ValueError("Stanford Word Segmenter is not installed.")
            return sentence_

    def detokenize(self, sentence):
"""
This method detokenizes a sentence.
:param sentence: (str) sentence.
:return: (str) detokenized sentence.
"""
if self.tokenizer_type == 'moses':
return self.detokenizer.detokenize(sentence.split())
if self.tokenizer_type == 'sentencepiece':
return self.tokenizer.DecodePieces(sentence.split())
raise ValueError("Detokenization not yet implemented for this tokenizer.")
def apply_bpe(self, sentence):
"""
This method applies BPE to a sentence.
:param sentence: (str) sentence.
:return: (str) BPE sentence.
"""
return self.bpe.segment(sentence)
def remove_bpe(self, sentence):
"""
This method removes BPE from a sentence.
:param sentence: (str) BPE sentence.
        :return: (str) sentence with BPE removed.
"""
return sub("(@@ )|(@@ ?$)", '', sentence)
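
# --- Minimal usage sketch (added for illustration, not part of the original file) ---
# Assumptions: sacremoses is installed and Moses is the chosen tokenizer; pass a
# bpe_codes path when constructing the Tokenizer to also use apply_bpe()/remove_bpe().
if __name__ == '__main__':
    tok = Tokenizer('moses')
    tokenized = tok.tokenize("Hello, world! This is a test.")
    print(tokenized)                  # space-separated Moses tokens
    print(tok.detokenize(tokenized))  # round-trips back to plain text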