-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenize.cpp
More file actions
90 lines (82 loc) · 5.28 KB
/
tokenize.cpp
File metadata and controls
90 lines (82 loc) · 5.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
//#include "global.h"
#include "node.hpp"
#include <ctre.hpp>
namespace turtle {
void tokenize(std::string &filedata, std::vector<turtle::_Lexeme> &Lexemes)
{
/*
Paste into regex101.com
Replace regex comments with )"${2}R"( by using (\(\?#([^)]*)\))|^
[a-zA-Z]{0,2}?(("|')(\2{2})?)((?:[^\\"]|\\.|\\)*\1)?|(?# //capture strings - check later on if string
// prefix is valid and the string terminates
)(#[^\r\n]*)|(?# //capture comments
)([\n\r][ \t]*)|(?# //capture newlines
)(\\[^\r\n]*)|(?# //capture \TheBackslashAndAnythingAfterIt
)((?#
)(\.{3})|(?# //capture ...
)(->)|(?# //capture ->
//fucking floating point numbers
)(\d[\d_]*\.[\d_]*\d[\d_]*[eE]-?[\d_]*)|(?# //capture exponential floating point literals
)(\d[\d_]*\.[\d_]*\d[\d_]*[\w]*)|(?# //capture floating point literals -> \d.\d [suffix]
)(\d[\d_]*\.[eE]-?[\d_]*)|(?# //capture exponential floating point literals
)(\d[\d_]*\.\w*)|(?# //capture floating point literals -> \d. [suffix]
)(\.\d[\d_]*[eE]-?[\d_]*)|(?# //capture exponential floating point literals
)(\.\d[\d_]*\w*)|(?# //capture floating point literals -> .\d [suffix]
)(\d[\d_]*[eE]-?[\d_]*)|(?# //capture exponential literals
)([<>*\/]{2})=?|(?# //capture 2-3 character operators
)([!%&*+\-<=>@\/\\^|:]=)(?# //capture 2 caracter operators
))|(?#
)([!-\/:-@\[-^{-~]|[^\s!-\/:-@\[-^{-~]+)|(?# //capture anything else
)(\s+) //capture whitespace in order to keep track of position with ctre
*/
//when rEgEX is A LaNGUAgE
static constexpr ctll::fixed_string TokenRegex {
R"([a-zA-Z]{0,2}?(("|')(\2{2})?)((?:[^\\"]|\\.|\\)*\1)?|)" //capture strings - check later on if string
// prefix is valid and the string terminates
R"((#[^\r\n]*)|)" //capture comments
R"(([\n\r][ \t]*)|)" //capture newlines
R"((\\[^\r\n]*)|)" //capture \TheBackslashAndAnythingAfterIt
R"(()"
R"((\.{3})|)" //capture ...
R"((->)|)" //capture ->
//fucking floating point numbers
R"((\d[\d_]*\.[\d_]*\d[\d_]*[eE]-?[\d_]*)|)" //capture exponential floating point literals
R"((\d[\d_]*\.[\d_]*\d[\d_]*[\w]*)|)" //capture floating point literals -> \d.\d [suffix]
R"((\d[\d_]*\.[eE]-?[\d_]*)|)" //capture exponential floating point literals
R"((\d[\d_]*\.\w*)|)" //capture floating point literals -> \d. [suffix]
R"((\.\d[\d_]*[eE]-?[\d_]*)|)" //capture exponential floating point literals
R"((\.\d[\d_]*\w*)|)" //capture floating point literals -> .\d [suffix]
R"((\d[\d_]*[eE]-?[\d_]*)|)" //capture exponential literals
R"(([<>*\/]{2})=?|)" //capture 2-3 character operators
R"(([!%&*+\-<=>@\/\\^|:]=))" //capture 2 caracter operators
R"()|)"
R"((\{\}|\(\)|\[\])|)" //capture empty braces. Due to the fact that theres nothing in
// them we can combine them as a single token
R"(([!-\/:-@\[-^{-~]|[^\s!-\/:-@\[-^{-~]+)|)" //capture anything else
R"((\s+))" //capture whitespace in order to keep track of position with ctre
};
const auto& matches = ctre::range<TokenRegex>(static_cast<std::string_view>(filedata));
//std::distance is not constexpr thus it does not work with ctre
auto distance = [](const auto& first, const auto& last){
size_t i = 0;
for(auto it = first; it != last; ++it){
++i;
}
return i;
};
Lexemes.reserve(distance(matches.begin(), matches.end()));
unsigned int ln = 0, nl_pos = 0;
uint_fast64_t position = 0;
for (const auto & match : matches){
position += match.size();
const auto& str = match.str();
if(str[0] == '\n' || str[0] == '\r'){
++ln;
nl_pos = position;
}
if(str[0] != ' '){
Lexemes.push_back({str, (uint_fast64_t)(position - nl_pos), ln});
}
}
}
}