|
1 | 1 | #ifndef SOURCEMETA_CORE_URI_NORMALIZE_H_ |
2 | 2 | #define SOURCEMETA_CORE_URI_NORMALIZE_H_ |
3 | 3 |
|
4 | | -#include <string> // std::string |
| 4 | +#include <cstddef> // std::size_t |
| 5 | +#include <string> // std::string |
| 6 | +#include <string_view> // std::string_view |
5 | 7 |
|
6 | 8 | namespace sourcemeta::core { |
7 | 9 |
|
// Remove "." and ".." segments from a URI path, in place, following RFC 3986
// Section 5.2.4 (Remove Dot Segments).
//
// For absolute paths (leading "/") the result matches the specification:
// ".." segments that would climb above the root are discarded.
//
// For relative paths, leading "../" segments are preserved as an extension,
// because the specification assumes a path that has already been merged with
// an absolute base. Dropping them from a stand-alone relative path would lose
// information needed when the path is resolved later. A ".." that follows
// ordinary segments first cancels those segments and, once none are left,
// extends the preserved "../" prefix instead of eating into it. For example,
// "../a/../../b" becomes "../../b".
//
// A trailing "." or ".." segment leaves a trailing slash on whatever remains
// (e.g. "/a/b/.." becomes "/a/" and ".." becomes "../").
inline auto normalize_path(std::string &path) -> void {
  // Take ownership of the original characters so that `path` can be
  // overwritten at the end while `input` keeps viewing stable storage
  const std::string buffer{std::move(path)};
  std::string_view input{buffer};
  std::string output;
  output.reserve(buffer.size());
  const bool is_absolute{!input.empty() && input.front() == '/'};

  // Number of leading characters of `output` that belong to the preserved
  // run of "../" segments. Backtracking must never remove these. This stays
  // zero for absolute paths, as their remaining input always starts with "/"
  std::size_t preserved_prefix{0};

  const auto has_prefix = [](const std::string_view text,
                             const std::string_view prefix) -> bool {
    return text.substr(0, prefix.size()) == prefix;
  };

  // Drop the last segment (and its preceding slash) from `output`. Returns
  // false when there was no slash to cut at beyond the preserved prefix, in
  // which case `output` is truncated to exactly that prefix
  const auto pop_segment = [&output, &preserved_prefix]() -> bool {
    const auto last_slash{output.rfind('/')};
    if (last_slash == std::string::npos || last_slash < preserved_prefix) {
      output.resize(preserved_prefix);
      return false;
    }

    output.resize(last_slash);
    return true;
  };

  while (!input.empty()) {
    if (has_prefix(input, "../")) {
      // The remaining input only starts without a slash while `output` holds
      // nothing but preserved "../" segments, so the whole output is prefix
      output.append("../");
      preserved_prefix = output.size();
      input.remove_prefix(3);
    } else if (has_prefix(input, "./") || has_prefix(input, "/./")) {
      // Skips "./", or turns "/./" into "/"
      input.remove_prefix(2);
    } else if (input == "/.") {
      output.push_back('/');
      break;
    } else if (has_prefix(input, "/../")) {
      // Keep the slash that follows the ".." segment at the front of input
      input.remove_prefix(3);
      if (!pop_segment() && !is_absolute) {
        // Nothing left to cancel in a relative path: drop the leading slash
        // so that the next segment is read as relative again, which lets a
        // following ".." extend the preserved prefix
        input.remove_prefix(1);
      }
    } else if (input == "/..") {
      if (pop_segment() || is_absolute) {
        output.push_back('/');
      }

      break;
    } else if (input == ".") {
      break;
    } else if (input == "..") {
      if (!is_absolute) {
        output.append("../");
      }

      break;
    } else {
      // Move the first segment, including its leading slash if any, up to
      // but excluding the next slash
      const bool leading_slash{input.front() == '/'};
      const std::size_t next_slash{leading_slash ? input.find('/', 1)
                                                 : input.find('/')};
      if (next_slash == std::string_view::npos) {
        output.append(input);
        break;
      }

      output.append(input.substr(0, next_slash));
      input.remove_prefix(next_slash);
    }
  }

  path = std::move(output);
}
110 | 76 |
|
111 | 77 | } // namespace sourcemeta::core |
|
0 commit comments