Skip to content

Commit fc0dd4e

Browse files
authored
Implement performant URI path manipulation utilities (#2387)
Signed-off-by: Juan Cruz Viotti <jv@jviotti.com>
1 parent ec9009d commit fc0dd4e

9 files changed

Lines changed: 959 additions & 156 deletions

File tree

src/core/uri/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
sourcemeta_library(NAMESPACE sourcemeta PROJECT core NAME uri
22
PRIVATE_HEADERS error.h
33
SOURCES uri.cc parse.cc accessors.cc setters.cc recompose.cc canonicalize.cc
4-
resolution.cc filesystem.cc query.cc escaping.h normalize.h grammar.h)
4+
resolution.cc filesystem.cc query.cc path.cc escaping.h normalize.h
5+
grammar.h)
56

67
if(SOURCEMETA_CORE_INSTALL)
78
sourcemeta_library_install(NAMESPACE sourcemeta PROJECT core NAME uri)

src/core/uri/include/sourcemeta/core/uri.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -641,6 +641,37 @@ class SOURCEMETA_CORE_URI_EXPORT URI {
641641
[[nodiscard]] static auto is_uri_reference(std::string_view input) noexcept
642642
-> bool;
643643

644+
/// Strip a URI path prefix and return the remaining suffix. For example:
645+
///
646+
/// ```cpp
647+
/// #include <sourcemeta/core/uri.h>
648+
/// #include <cassert>
649+
///
650+
/// const auto result{
651+
/// sourcemeta::core::URI::strip_path_prefix("/foo/bar/baz", "/foo")};
652+
/// assert(result.has_value());
653+
/// assert(result.value() == "bar/baz");
654+
/// ```
655+
[[nodiscard]] static auto strip_path_prefix(std::string_view path,
656+
std::string_view prefix)
657+
-> std::optional<std::string>;
658+
659+
/// Replace a URI path prefix with a new prefix. For example:
660+
///
661+
/// ```cpp
662+
/// #include <sourcemeta/core/uri.h>
663+
/// #include <cassert>
664+
///
665+
/// const auto result{sourcemeta::core::URI::rebase_path(
666+
/// "/foo/bar/baz", "/foo", "https://example.com")};
667+
/// assert(result.has_value());
668+
/// assert(result.value() == "https://example.com/bar/baz");
669+
/// ```
670+
[[nodiscard]] static auto rebase_path(std::string_view path,
671+
std::string_view old_prefix,
672+
std::string_view new_prefix)
673+
-> std::optional<std::string>;
674+
644675
private:
645676
auto parse(std::string_view input) -> void;
646677

src/core/uri/normalize.h

Lines changed: 59 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -1,111 +1,77 @@
11
#ifndef SOURCEMETA_CORE_URI_NORMALIZE_H_
22
#define SOURCEMETA_CORE_URI_NORMALIZE_H_
33

4-
#include <string> // std::string
4+
#include <cstddef> // std::size_t
5+
#include <string> // std::string
6+
#include <string_view> // std::string_view
57

68
namespace sourcemeta::core {
79

8-
// Normalize a URI path by removing "." and ".." segments
9-
// Updates the path in-place according to RFC 3986 path segment normalization
10-
// Handles:
11-
// - Removal of "." segments
12-
// - Resolution of ".." segments with proper backtracking
13-
// - Preservation of leading ".." for relative paths
14-
// - Preservation of trailing slashes
15-
// - Preservation of empty segments (consecutive slashes)
10+
// Remove "." and ".." segments from a URI path per RFC 3986 Section 5.2.4
11+
// (Remove Dot Segments). For absolute paths this matches the specification
12+
// verbatim. For relative paths, leading "../" blocks are preserved as an
13+
// extension because the spec algorithm assumes a path that has already been
14+
// merged with an absolute base; applied to a stand-alone relative path it
15+
// would discard semantic intent that is needed at later resolution time.
1616
inline auto normalize_path(std::string &path) -> void {
  // Removes "." and ".." segments from `path` in place, following the shape
  // of RFC 3986 Section 5.2.4. For relative paths, ".." segments that cannot
  // be resolved against an earlier segment are kept as a leading run of "../"
  // blocks instead of being discarded.

  // Take over the caller's storage so `input` can view the original text
  // while the result is assembled separately.
  std::string buffer;
  buffer.swap(path);
  std::string_view input{buffer};
  std::string output;
  output.reserve(buffer.size());
  const bool is_absolute{!input.empty() && input.front() == '/'};

  // Length of the leading run of preserved "../" blocks in `output`. It only
  // grows for relative paths. Removing a segment must never truncate below
  // this mark, otherwise an earlier preserved ".." would be silently lost
  // (i.e. "../a/../../b" would collapse to "b" instead of "../../b").
  std::size_t preserved{0};

  // Prefix test expressed through substr(), which clamps the length, so a
  // prefix longer than the remaining input simply compares unequal.
  const auto begins_with = [&input](const std::string_view prefix) -> bool {
    return input.substr(0, prefix.size()) == prefix;
  };

  while (!input.empty()) {
    if (begins_with("../")) {
      // Only reachable while `output` holds nothing but preserved blocks:
      // after any ordinary segment the remaining input starts with '/'.
      output.append("../");
      preserved = output.size();
      input.remove_prefix(3);
    } else if (begins_with("./") || begins_with("/./")) {
      // Drops "./", or the "/." part of "/./" (leaving the second slash)
      input.remove_prefix(2);
    } else if (input == "/.") {
      output.push_back('/');
      break;
    } else if (begins_with("/../")) {
      // Consume "/.." and keep the slash that follows it
      input.remove_prefix(3);
      const auto last_slash{output.rfind('/')};
      if (last_slash == std::string::npos || last_slash < preserved) {
        // The last segment sits directly after the preserved run (or at the
        // very start). Drop just that segment. For relative paths also drop
        // the separator, so that a following ".." is seen as a new leading
        // "../" block rather than as "/../".
        output.resize(preserved);
        if (!is_absolute && !input.empty() && input.front() == '/') {
          input.remove_prefix(1);
        }
      } else {
        output.resize(last_slash);
      }
    } else if (input == "/..") {
      // Trailing "..": remove the last segment but keep a trailing slash
      const auto last_slash{output.rfind('/')};
      if (last_slash == std::string::npos) {
        output.clear();
        if (is_absolute) {
          output.push_back('/');
        }
      } else {
        // While any "../" block is preserved, `last_slash` is at least the
        // slash that closes the run, so this never cuts into it
        output.resize(last_slash);
        output.push_back('/');
      }
      break;
    } else if (input == ".") {
      break;
    } else if (input == "..") {
      if (!is_absolute) {
        output.append("../");
      }
      break;
    } else {
      // Move one segment (including its leading slash, if any) to the output
      const std::size_t next_slash{input.front() == '/' ? input.find('/', 1)
                                                        : input.find('/')};
      if (next_slash == std::string_view::npos) {
        output.append(input);
        break;
      }
      output.append(input.substr(0, next_slash));
      input.remove_prefix(next_slash);
    }
  }

  path.swap(output);
}
11076

11177
} // namespace sourcemeta::core

src/core/uri/path.cc

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
#include <sourcemeta/core/uri.h>
2+
3+
#include "escaping.h"
4+
#include "grammar.h"
5+
#include "normalize.h"
6+
7+
#include <cstddef> // std::size_t
8+
#include <optional> // std::optional, std::nullopt
9+
#include <string> // std::string
10+
#include <string_view> // std::string_view
11+
12+
namespace {
13+
14+
auto canonicalize_path(const std::string_view input, std::string &output)
15+
-> bool {
16+
output.assign(input);
17+
if (output.empty()) {
18+
return true;
19+
}
20+
if (output.front() != sourcemeta::core::URI_SLASH) {
21+
return false;
22+
}
23+
24+
for (std::size_t index{0}; index < output.size();) {
25+
const auto character{output[index]};
26+
if (character == sourcemeta::core::URI_PERCENT) {
27+
if (!sourcemeta::core::uri_is_percent_encoded(output, index)) {
28+
return false;
29+
}
30+
index += 3;
31+
} else if (character == sourcemeta::core::URI_SLASH ||
32+
sourcemeta::core::uri_is_pchar(character)) {
33+
++index;
34+
} else {
35+
return false;
36+
}
37+
}
38+
39+
sourcemeta::core::uri_normalize_percent_encoding_inplace(output);
40+
sourcemeta::core::uri_unescape_unreserved_inplace(output);
41+
sourcemeta::core::normalize_path(output);
42+
return true;
43+
}
44+
45+
} // namespace
46+
47+
namespace sourcemeta::core {
48+
49+
auto URI::strip_path_prefix(const std::string_view path,
50+
const std::string_view prefix)
51+
-> std::optional<std::string> {
52+
std::string path_canonical;
53+
std::string prefix_canonical;
54+
if (!canonicalize_path(path, path_canonical) ||
55+
!canonicalize_path(prefix, prefix_canonical)) {
56+
return std::nullopt;
57+
}
58+
59+
std::size_t suffix_start{0};
60+
const bool prefix_provides_boundary{prefix_canonical.ends_with(URI_SLASH)};
61+
if (!prefix_canonical.empty()) {
62+
if (!path_canonical.starts_with(prefix_canonical)) {
63+
return std::nullopt;
64+
}
65+
if (!prefix_provides_boundary &&
66+
path_canonical.size() > prefix_canonical.size() &&
67+
path_canonical[prefix_canonical.size()] != URI_SLASH) {
68+
return std::nullopt;
69+
}
70+
suffix_start = prefix_canonical.size();
71+
}
72+
if (!prefix_provides_boundary && suffix_start < path_canonical.size() &&
73+
path_canonical[suffix_start] == URI_SLASH) {
74+
++suffix_start;
75+
}
76+
77+
path_canonical.erase(0, suffix_start);
78+
return path_canonical;
79+
}
80+
81+
auto URI::rebase_path(const std::string_view path,
82+
const std::string_view old_prefix,
83+
const std::string_view new_prefix)
84+
-> std::optional<std::string> {
85+
const auto suffix{URI::strip_path_prefix(path, old_prefix)};
86+
if (!suffix.has_value()) {
87+
return std::nullopt;
88+
}
89+
const bool needs_separator{!suffix.value().empty() && !new_prefix.empty() &&
90+
!new_prefix.ends_with(URI_SLASH)};
91+
std::string result;
92+
result.reserve(new_prefix.size() + (needs_separator ? 1 : 0) +
93+
suffix.value().size());
94+
result.append(new_prefix);
95+
if (needs_separator) {
96+
result.push_back(URI_SLASH);
97+
}
98+
result.append(suffix.value());
99+
return result;
100+
}
101+
102+
} // namespace sourcemeta::core

0 commit comments

Comments
 (0)