-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcode_tokenizer.py
More file actions
30 lines (27 loc) · 1.06 KB
/
code_tokenizer.py
File metadata and controls
30 lines (27 loc) · 1.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from keras.preprocessing.text import Tokenizer, maketrans
from my_tokenizer import MyTokenizer
# we do not want to filter anything in code.
def base_code_filter():
    """Return the filter string for code tokenization.

    Code should not have any characters filtered out, so the filter
    is the empty string.
    """
    return ""
# this class overrides methods to be able to use the text_to_word_sequence function above.
class CodeTokenizer(MyTokenizer):
    """Tokenizer specialised for source code.

    Overrides text_to_word_sequence so the base tokenizer machinery uses a
    code-aware word splitter instead of the default whitespace/filter one.
    """

    def text_to_word_sequence(self, text, filters=base_code_filter(), lower=True, split=" "):
        """Tokenize code text.

        All consecutive alphanumeric characters are grouped into one token,
        thereby heuristically matching identifiers and number literals.
        Every other non-whitespace character becomes its own one-character
        token. Whitespace is stripped, except newline, which is kept as a
        token.

        NOTE: `filters` and `split` exist only to keep the overridden
        method's signature; they are unused by this implementation.

        # Parameters
            text: the code string to tokenize.
            lower: if True, lowercase the text before tokenizing.

        # Returns
            A list of non-empty string tokens.
        """
        if lower:
            text = text.lower()  # type: str
        seq = []
        curr = ""
        for c in text:
            if c.isalnum():
                # Extend the current identifier/number run.
                curr += c
            else:
                if curr != "":
                    seq.append(curr)
                curr = ""
                if not c.isspace() or c == '\n':
                    # Keep symbols and the newline token; drop other whitespace.
                    seq.append(c)
        # Bug fix: flush a trailing alphanumeric run. Previously, text ending
        # in an identifier (no trailing symbol/whitespace) lost its last token.
        if curr:
            seq.append(curr)
        return [_f for _f in seq if _f]