From 62458b737221d8ffe8654aceb65ff4eb0d7d6f80 Mon Sep 17 00:00:00 2001 From: Vincenzo Petrucci Date: Sat, 16 May 2026 14:28:24 +0200 Subject: [PATCH 1/4] feat: support strtotime article offsets --- examples/strtotime-relative/main.php | 6 ++ src/codegen/runtime/system/strtotime/data.rs | 3 + src/codegen/runtime/system/strtotime/mod.rs | 16 +++- .../runtime/system/strtotime/offsets.rs | 84 +++++++++++++++++-- tests/codegen/system.rs | 26 ++++++ 5 files changed, 127 insertions(+), 8 deletions(-) diff --git a/examples/strtotime-relative/main.php b/examples/strtotime-relative/main.php index f5ac3235..5fe069a7 100644 --- a/examples/strtotime-relative/main.php +++ b/examples/strtotime-relative/main.php @@ -9,6 +9,12 @@ $past = strtotime("3 days ago"); echo "3 days ago = " . date("Y-m-d", $past) . "\n"; +$article = strtotime("a day ago"); +echo "a day ago = " . date("Y-m-d H:i", $article) . "\n"; + +$hour = strtotime("an hour"); +echo "an hour = " . date("H:i", $hour) . "\n"; + $weekday = strtotime("next Monday"); echo "next Monday = " . date("Y-m-d (D)", $weekday) . "\n"; diff --git a/src/codegen/runtime/system/strtotime/data.rs b/src/codegen/runtime/system/strtotime/data.rs index 18f9cd8e..a0fbc4c6 100644 --- a/src/codegen/runtime/system/strtotime/data.rs +++ b/src/codegen/runtime/system/strtotime/data.rs @@ -14,6 +14,7 @@ /// Kinds 0-5: bare keywords. Kinds 6-8: modifiers consumed by the weekday strategy. /// Kind 9: `ago` (consumed by the offsets strategy as a trailing suffix). /// Kinds 10-16: weekday names (10=Sun..16=Sat) — full and abbreviated forms share the same kind. +/// Kinds 17-18: "a"/"an" relative magnitudes consumed by the offsets strategy. 
const KEYWORDS: &[(&str, u8)] = &[ ("now", 0), ("today", 1), @@ -25,6 +26,8 @@ const KEYWORDS: &[(&str, u8)] = &[ ("last", 7), ("this", 8), ("ago", 9), + ("a", 17), + ("an", 18), ("sunday", 10), ("monday", 11), ("tuesday", 12), diff --git a/src/codegen/runtime/system/strtotime/mod.rs b/src/codegen/runtime/system/strtotime/mod.rs index 4c230d2e..34a6acb7 100644 --- a/src/codegen/runtime/system/strtotime/mod.rs +++ b/src/codegen/runtime/system/strtotime/mod.rs @@ -109,6 +109,8 @@ fn emit_dispatcher_arm64(emitter: &mut Emitter) { emitter.instruction("sub w10, w9, #97"); // 'a' = 97 emitter.instruction("cmp w10, #25"); // ASCII alpha (a-z) ? emitter.instruction("b.hi __rt_strtotime_fail"); // not alpha → fail + emitter.instruction("cmp w9, #97"); // possible "a/an " article-relative form ? + emitter.instruction("b.eq __rt_strtotime_offsets_entry"); // let offsets parse or reject the article form // -- alpha: try keyword table match -- emitter.instruction("add x6, sp, #64"); // x6 = lc16 buffer ptr (candidate) @@ -134,7 +136,11 @@ fn emit_dispatcher_arm64(emitter: &mut Emitter) { emitter.instruction("cmp x9, #10"); // kind 9 = bare "ago" (not a top-level term) emitter.instruction("b.lt __rt_strtotime_fail"); // → fail emitter.instruction("cmp x9, #16"); // kind 10..16 = weekday name ? - emitter.instruction("b.gt __rt_strtotime_fail"); // unknown kind → fail + emitter.instruction("b.le __rt_strtotime_alpha_direct_weekday"); // yes → direct weekday strategy + emitter.instruction("cmp x9, #18"); // kind 17..18 = a/an relative magnitude ? + emitter.instruction("b.le __rt_strtotime_offsets_entry"); // let the offsets strategy parse the full relative expression + emitter.instruction("b __rt_strtotime_fail"); // unknown kind → fail + emitter.label("__rt_strtotime_alpha_direct_weekday"); emitter.instruction("ldr x8, [sp, #56]"); // reload trimmed input length emitter.instruction("cmp x10, x8"); // weekday consumed the whole input ? 
emitter.instruction("b.ne __rt_strtotime_fail"); // trailing junk after weekday → fail @@ -210,6 +216,8 @@ fn emit_dispatcher_linux_x86_64(emitter: &mut Emitter) { emitter.instruction("sub ecx, 97"); // 'a' = 97 emitter.instruction("cmp ecx, 25"); // ASCII alpha ? emitter.instruction("ja __rt_strtotime_fail_linux_x86_64"); // not alpha → fail + emitter.instruction("cmp al, 97"); // possible "a/an " article-relative form ? + emitter.instruction("je __rt_strtotime_offsets_entry_linux_x86_64"); // let offsets parse or reject the article form // -- alpha: try keyword table match -- // Args (caller-saved): rdi = candidate ptr, rsi = table base, rcx = available bytes. @@ -236,7 +244,11 @@ fn emit_dispatcher_linux_x86_64(emitter: &mut Emitter) { emitter.instruction("cmp rdx, 10"); // kind 9 = bare "ago" → fail emitter.instruction("jl __rt_strtotime_fail_linux_x86_64"); // below 10 → fail emitter.instruction("cmp rdx, 16"); // weekday name ? - emitter.instruction("jg __rt_strtotime_fail_linux_x86_64"); // unknown kind → fail + emitter.instruction("jle __rt_strtotime_alpha_direct_weekday_linux_x86_64"); // yes → direct weekday strategy + emitter.instruction("cmp rdx, 18"); // kind 17..18 = a/an relative magnitude ? + emitter.instruction("jle __rt_strtotime_offsets_entry_linux_x86_64"); // let the offsets strategy parse the full relative expression + emitter.instruction("jmp __rt_strtotime_fail_linux_x86_64"); // unknown kind → fail + emitter.label("__rt_strtotime_alpha_direct_weekday_linux_x86_64"); emitter.instruction("cmp rax, QWORD PTR [rbp - 72]"); // weekday consumed the whole input ? 
emitter.instruction("jne __rt_strtotime_fail_linux_x86_64"); // trailing junk after weekday → fail emitter.instruction("jmp __rt_strtotime_weekdays_entry_linux_x86_64"); // yes → weekdays diff --git a/src/codegen/runtime/system/strtotime/offsets.rs b/src/codegen/runtime/system/strtotime/offsets.rs index 8100f1b6..a09aae3a 100644 --- a/src/codegen/runtime/system/strtotime/offsets.rs +++ b/src/codegen/runtime/system/strtotime/offsets.rs @@ -1,5 +1,5 @@ //! Purpose: -//! Emits the relative-offset parser sub-routine for `__rt_strtotime` — supports `[+-]?N unit`, composite forms (`"+1 day 2 hours"`), and trailing `ago`. +//! Emits the relative-offset parser sub-routine for `__rt_strtotime` — supports `[+-]?N unit`, `a/an unit`, composite forms (`"+1 day 2 hours"`), and trailing `ago`. //! Combines per-term offsets into `tm_*` fields via the now_tm helper, then normalizes through libc `mktime` for DST-aware day/week math. //! //! Called from: @@ -63,10 +63,46 @@ fn emit_offsets_arm64(emitter: &mut Emitter) { emitter.instruction("str w12, [sp, #104]"); // save sign across helper calls // -- parse decimal magnitude -- - emitter.instruction("mov x10, x3"); // remember cursor before parse_dec + emitter.instruction("mov x12, x3"); // remember cursor before parse_dec emitter.instruction("bl __rt_strtotime_parse_dec"); // x5 = value, x3 = new cursor - emitter.instruction("cmp x3, x10"); // cursor advanced ? - emitter.instruction("b.eq __rt_strtotime_fail"); // no digits → fail + emitter.instruction("cmp x3, x12"); // cursor advanced ? + emitter.instruction("b.ne __rt_strtotime_offsets_value_ready"); // numeric value parsed → continue + + // -- accept PHP relative articles: "a day", "an hour" -- + emitter.instruction("cmp x3, x4"); // any bytes left for a/an ? 
+ emitter.instruction("b.ge __rt_strtotime_fail"); // no → fail + emitter.instruction("ldrb w14, [x3]"); // load candidate article first byte + emitter.instruction("orr w14, w14, #0x20"); // lowercase ASCII + emitter.instruction("cmp w14, #97"); // 'a' ? + emitter.instruction("b.ne __rt_strtotime_fail"); // no article → fail + emitter.instruction("add x15, x3, #1"); // position after "a" + emitter.instruction("cmp x15, x4"); // input ended after "a" ? + emitter.instruction("b.ge __rt_strtotime_offsets_article_a"); // consume it and let unit parsing fail if missing + emitter.instruction("ldrb w14, [x3, #1]"); // load byte after "a" + emitter.instruction("orr w14, w14, #0x20"); // lowercase ASCII + emitter.instruction("cmp w14, #110"); // 'n' ? + emitter.instruction("b.eq __rt_strtotime_offsets_article_an_check"); // maybe "an" + emitter.instruction("sub w14, w14, #97"); // normalize byte after "a" for alpha boundary check + emitter.instruction("cmp w14, #25"); // alpha immediately after "a" ? + emitter.instruction("b.ls __rt_strtotime_fail"); // yes → not the article word + emitter.label("__rt_strtotime_offsets_article_a"); + emitter.instruction("add x3, x3, #1"); // consume "a" + emitter.instruction("mov x5, #1"); // article magnitude = 1 + emitter.instruction("b __rt_strtotime_offsets_value_ready"); // continue with unit parsing + emitter.label("__rt_strtotime_offsets_article_an_check"); + emitter.instruction("add x15, x3, #2"); // position after "an" + emitter.instruction("cmp x15, x4"); // input ended after "an" ? + emitter.instruction("b.ge __rt_strtotime_offsets_article_an"); // consume it and let unit parsing fail if missing + emitter.instruction("ldrb w14, [x3, #2]"); // load byte after "an" + emitter.instruction("orr w14, w14, #0x20"); // lowercase ASCII + emitter.instruction("sub w14, w14, #97"); // normalize for alpha boundary check + emitter.instruction("cmp w14, #25"); // alpha immediately after "an" ? 
+ emitter.instruction("b.ls __rt_strtotime_fail"); // yes → not the article word + emitter.label("__rt_strtotime_offsets_article_an"); + emitter.instruction("add x3, x3, #2"); // consume "an" + emitter.instruction("mov x5, #1"); // article magnitude = 1 + + emitter.label("__rt_strtotime_offsets_value_ready"); emitter.instruction("bl __rt_strtotime_skip_ws"); // WS between number and unit // -- lowercase next 16 bytes from cursor into [sp+64..79] for unit match -- @@ -251,12 +287,48 @@ fn emit_offsets_linux_x86_64(emitter: &mut Emitter) { emitter.label("__rt_strtotime_offsets_save_sign_linux_x86_64"); emitter.instruction("mov DWORD PTR [rsp + 104], ecx"); // save sign across helper calls - // -- parse decimal magnitude -- + // -- parse decimal magnitude or PHP relative articles a/an -- emitter.label("__rt_strtotime_offsets_parse_value_linux_x86_64"); emitter.instruction("mov r11, rdi"); // cursor before parse_dec emitter.instruction("call __rt_strtotime_parse_dec_linux_x86_64"); // rax = value, rdi = new cursor emitter.instruction("cmp rdi, r11"); // cursor advanced ? - emitter.instruction("je __rt_strtotime_fail_linux_x86_64"); // no digits → fail + emitter.instruction("jne __rt_strtotime_offsets_value_ready_linux_x86_64"); // numeric value parsed → continue + emitter.instruction("cmp rdi, r10"); // any bytes left for a/an ? + emitter.instruction("jge __rt_strtotime_fail_linux_x86_64"); // no → fail + emitter.instruction("movzx eax, BYTE PTR [rdi]"); // load candidate article first byte + emitter.instruction("or al, 32"); // lowercase ASCII + emitter.instruction("cmp al, 97"); // 'a' ? + emitter.instruction("jne __rt_strtotime_fail_linux_x86_64"); // no article → fail + emitter.instruction("lea r8, [rdi + 1]"); // position after "a" + emitter.instruction("cmp r8, r10"); // input ended after "a" ? 
+ emitter.instruction("jge __rt_strtotime_offsets_article_a_linux_x86_64"); // consume it and let unit parsing fail if missing + emitter.instruction("movzx eax, BYTE PTR [rdi + 1]"); // load byte after "a" + emitter.instruction("or al, 32"); // lowercase ASCII + emitter.instruction("cmp al, 110"); // 'n' ? + emitter.instruction("je __rt_strtotime_offsets_article_an_check_linux_x86_64"); // maybe "an" + emitter.instruction("mov ecx, eax"); // copy byte for boundary check + emitter.instruction("sub ecx, 97"); // normalize byte after "a" + emitter.instruction("cmp ecx, 25"); // alpha immediately after "a" ? + emitter.instruction("jbe __rt_strtotime_fail_linux_x86_64"); // yes → not the article word + emitter.label("__rt_strtotime_offsets_article_a_linux_x86_64"); + emitter.instruction("inc rdi"); // consume "a" + emitter.instruction("mov rax, 1"); // article magnitude = 1 + emitter.instruction("jmp __rt_strtotime_offsets_value_ready_linux_x86_64"); // continue with unit parsing + emitter.label("__rt_strtotime_offsets_article_an_check_linux_x86_64"); + emitter.instruction("lea r8, [rdi + 2]"); // position after "an" + emitter.instruction("cmp r8, r10"); // input ended after "an" ? + emitter.instruction("jge __rt_strtotime_offsets_article_an_linux_x86_64"); // consume it and let unit parsing fail if missing + emitter.instruction("movzx eax, BYTE PTR [rdi + 2]"); // load byte after "an" + emitter.instruction("or al, 32"); // lowercase ASCII + emitter.instruction("mov ecx, eax"); // copy byte for boundary check + emitter.instruction("sub ecx, 97"); // normalize byte after "an" + emitter.instruction("cmp ecx, 25"); // alpha immediately after "an" ? 
+ emitter.instruction("jbe __rt_strtotime_fail_linux_x86_64"); // yes → not the article word + emitter.label("__rt_strtotime_offsets_article_an_linux_x86_64"); + emitter.instruction("add rdi, 2"); // consume "an" + emitter.instruction("mov rax, 1"); // article magnitude = 1 + + emitter.label("__rt_strtotime_offsets_value_ready_linux_x86_64"); emitter.instruction("mov QWORD PTR [rsp + 120], rax"); // save value across upcoming helpers emitter.instruction("call __rt_strtotime_skip_ws_linux_x86_64"); // WS between number and unit diff --git a/tests/codegen/system.rs b/tests/codegen/system.rs index 02f18d0c..7f23b445 100644 --- a/tests/codegen/system.rs +++ b/tests/codegen/system.rs @@ -426,6 +426,32 @@ if ($diff >= 3590 && $diff <= 3610) echo "ok"; assert_eq!(out, "ok"); } +#[test] +fn test_strtotime_offset_article_day_ago() { + let out = compile_and_run( + r#"= 86400 - 3700 && $diff <= 86400 + 3700) echo "ok"; +"#, + ); + assert_eq!(out, "ok"); +} + +#[test] +fn test_strtotime_offset_article_an_hour() { + let out = compile_and_run( + r#"= 3590 && $diff <= 3610) echo "ok"; +"#, + ); + assert_eq!(out, "ok"); +} + #[test] fn test_strtotime_offset_plus_30_seconds() { let out = compile_and_run( From ffb03d2bb3f811b15b519766aad87e64ac89ab10 Mon Sep 17 00:00:00 2001 From: Vincenzo Petrucci Date: Sat, 16 May 2026 14:28:35 +0200 Subject: [PATCH 2/4] feat: support preg_replace backreferences --- examples/date-json-regex/main.php | 3 + src/codegen/runtime/system/preg_replace.rs | 140 ++++++++++++++++++--- tests/codegen/system.rs | 20 +++ 3 files changed, 147 insertions(+), 16 deletions(-) diff --git a/examples/date-json-regex/main.php b/examples/date-json-regex/main.php index af45b99b..fb9ce26a 100644 --- a/examples/date-json-regex/main.php +++ b/examples/date-json-regex/main.php @@ -50,6 +50,9 @@ $cleaned = preg_replace("/[ ]+/", " ", "hello world test"); echo "Cleaned: " . $cleaned . 
"\n"; +$name = preg_replace("/([a-z]+) ([a-z]+)/", '$2, $1', "ada lovelace"); +echo "Name swap: " . $name . "\n"; + // Split $parts = preg_split("/[,;]+/", "one,two;;three,four"); echo "Parts: " . count($parts) . "\n"; diff --git a/src/codegen/runtime/system/preg_replace.rs b/src/codegen/runtime/system/preg_replace.rs index 615dff02..f8070295 100644 --- a/src/codegen/runtime/system/preg_replace.rs +++ b/src/codegen/runtime/system/preg_replace.rs @@ -10,6 +10,8 @@ use crate::codegen::{abi, emit::Emitter, platform::Arch}; +const PREG_REPLACE_NMATCH: usize = 10; + /// __rt_preg_replace: replace all regex matches in subject string. /// Input: x1=pattern ptr, x2=pattern len, x3=replacement ptr, x4=replacement len, /// x5=subject ptr, x6=subject len @@ -22,7 +24,8 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { let regex_t_size = emitter.platform.regex_t_size(); let regmatch_off = regex_t_size; - let pattern_ptr_off = regmatch_off + emitter.platform.regmatch_t_size(); + let regmatches_size = emitter.platform.regmatch_t_size() * PREG_REPLACE_NMATCH; + let pattern_ptr_off = regmatch_off + regmatches_size; let pattern_len_off = pattern_ptr_off + 8; let replacement_ptr_off = pattern_len_off + 8; let replacement_len_off = replacement_ptr_off + 8; @@ -99,7 +102,7 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { emitter.instruction("cbz w9, __rt_preg_replace_done"); // end of string emitter.instruction("mov x0, sp"); // regex_t - emitter.instruction("mov x2, #1"); // nmatch + emitter.instruction(&format!("mov x2, #{}", PREG_REPLACE_NMATCH)); // capture full match plus replacement backreference groups emitter.instruction(&format!("add x3, sp, #{}", regmatch_off)); // regmatch_t buffer emitter.instruction("mov x4, #0"); // eflags emitter.bl_c("regexec"); // execute @@ -119,7 +122,7 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { emitter.instruction("add x12, x12, #1"); // increment emitter.instruction("b __rt_preg_replace_pre"); // continue - // 
-- copy replacement string -- + // -- copy replacement string, expanding $1 and \1 style backreferences -- emitter.label("__rt_preg_replace_repl"); emitter.instruction(&format!("ldr x1, [sp, #{}]", replacement_ptr_off)); // replacement ptr emitter.instruction(&format!("ldr x2, [sp, #{}]", replacement_len_off)); // replacement len @@ -128,15 +131,70 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { emitter.instruction("cmp x12, x2"); // check if done emitter.instruction("b.ge __rt_preg_replace_advance"); // done emitter.instruction("ldrb w13, [x1, x12]"); // load replacement byte - emitter.instruction("strb w13, [x11]"); // write byte + emitter.instruction("cmp w13, #36"); // '$' may introduce a replacement backreference + emitter.instruction("b.eq __rt_preg_replace_backref_probe"); // inspect the next byte for a group number + emitter.instruction("cmp w13, #92"); // '\\' may introduce a replacement backreference + emitter.instruction("b.eq __rt_preg_replace_backref_probe"); // inspect the next byte for a group number + emitter.label("__rt_preg_replace_repl_literal"); + emitter.instruction("strb w13, [x11]"); // write literal replacement byte emitter.instruction("add x11, x11, #1"); // advance output - emitter.instruction("add x12, x12, #1"); // increment + emitter.instruction("add x12, x12, #1"); // consume one replacement byte emitter.instruction("b __rt_preg_replace_repl_copy"); // continue + emitter.label("__rt_preg_replace_backref_probe"); + emitter.instruction("add x14, x12, #1"); // index of potential group digit + emitter.instruction("cmp x14, x2"); // replacement ended after marker ? + emitter.instruction("b.ge __rt_preg_replace_repl_literal"); // yes → keep marker literal + emitter.instruction("ldrb w15, [x1, x14]"); // load potential group digit + emitter.instruction("sub w15, w15, #48"); // convert ASCII digit to group index + emitter.instruction("cmp w15, #9"); // is it 0..9 ? 
+ emitter.instruction("b.hi __rt_preg_replace_repl_literal"); // no → keep marker literal + emitter.instruction("mov x14, x15"); // x14 = group index + if emitter.platform.regmatch_t_size() == 16 { + emitter.instruction("lsl x14, x14, #4"); // group index * sizeof(regmatch_t) + emitter.instruction(&format!("add x14, x14, #{}", regmatch_off)); // offset to selected regmatch_t + emitter.instruction("ldr x15, [sp, x14]"); // load rm_so for selected capture + emitter.instruction(&format!( + "add x14, x14, #{}", + emitter.platform.regmatch_rm_eo_offset() + )); // offset to rm_eo + emitter.instruction("ldr x16, [sp, x14]"); // load rm_eo for selected capture + } else { + emitter.instruction("lsl x14, x14, #3"); // group index * sizeof(regmatch_t) + emitter.instruction(&format!("add x14, x14, #{}", regmatch_off)); // offset to selected regmatch_t + emitter.instruction("ldrsw x15, [sp, x14]"); // load rm_so for selected capture + emitter.instruction(&format!( + "add x14, x14, #{}", + emitter.platform.regmatch_rm_eo_offset() + )); // offset to rm_eo + emitter.instruction("ldrsw x16, [sp, x14]"); // load rm_eo for selected capture + } + emitter.instruction("cmp x15, #0"); // capture was matched ? + emitter.instruction("b.lt __rt_preg_replace_backref_consume"); // unmatched captures expand to an empty string + emitter.instruction("sub x16, x16, x15"); // capture length = rm_eo - rm_so + emitter.instruction(&format!("ldr x17, [sp, #{}]", current_pos_off)); // reload current subject cursor + emitter.instruction("add x17, x17, x15"); // capture source pointer = current + rm_so + emitter.instruction("mov x9, #0"); // capture copy index + emitter.label("__rt_preg_replace_backref_copy"); + emitter.instruction("cmp x9, x16"); // copied entire capture ? 
+ emitter.instruction("b.ge __rt_preg_replace_backref_consume"); // yes → consume marker and group digit + emitter.instruction("ldrb w10, [x17, x9]"); // load capture byte + emitter.instruction("strb w10, [x11]"); // append capture byte + emitter.instruction("add x11, x11, #1"); // advance output + emitter.instruction("add x9, x9, #1"); // advance capture index + emitter.instruction("b __rt_preg_replace_backref_copy"); // continue copying capture bytes + emitter.label("__rt_preg_replace_backref_consume"); + emitter.instruction("add x12, x12, #2"); // consume marker plus group digit + emitter.instruction("b __rt_preg_replace_repl_copy"); // continue scanning replacement + // -- advance past match -- emitter.label("__rt_preg_replace_advance"); emitter.instruction(&format!("str x11, [sp, #{}]", output_write_off)); // save output write pos emitter.instruction(&emitter.platform.regoff_load_instr("x9", "sp", regmatch_off + emitter.platform.regmatch_rm_eo_offset())); // load rm_eo with the native regoff_t width + emitter.instruction("cmp x9, #0"); // zero-length match ? 
+ emitter.instruction("b.gt __rt_preg_replace_advance_ok"); // non-empty match advances by rm_eo + emitter.instruction("mov x9, #1"); // force progress after zero-length matches + emitter.label("__rt_preg_replace_advance_ok"); emitter.instruction(&format!("ldr x10, [sp, #{}]", current_pos_off)); // current pos emitter.instruction("add x10, x10, x9"); // advance past match emitter.instruction(&format!("str x10, [sp, #{}]", current_pos_off)); // save new pos @@ -186,7 +244,8 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { fn emit_preg_replace_linux_x86_64(emitter: &mut Emitter) { let regex_t_size = emitter.platform.regex_t_size(); let regmatch_off = regex_t_size; - let pattern_ptr_off = regmatch_off + emitter.platform.regmatch_t_size(); + let regmatches_size = emitter.platform.regmatch_t_size() * PREG_REPLACE_NMATCH; + let pattern_ptr_off = regmatch_off + regmatches_size; let pattern_len_off = pattern_ptr_off + 8; let replacement_ptr_off = pattern_len_off + 8; let replacement_len_off = replacement_ptr_off + 8; @@ -266,7 +325,7 @@ fn emit_preg_replace_linux_x86_64(emitter: &mut Emitter) { emitter.instruction("test r9d, r9d"); // treat the terminating null byte as the end-of-subject condition emitter.instruction("jz __rt_preg_replace_done_linux_x86_64"); // finish replacement when the full subject payload has been consumed emitter.instruction("lea rdi, [rsp]"); // pass the compiled regex_t storage as the first regexec() argument - emitter.instruction("mov edx, 1"); // request exactly one regmatch_t capture because replacement only needs the full match extent + emitter.instruction(&format!("mov edx, {}", PREG_REPLACE_NMATCH)); // capture full match plus replacement backreference groups emitter.instruction(&format!("lea rcx, [rsp + {}]", regmatch_off)); // pass the local regmatch_t buffer as the match extent output slot emitter.instruction("xor r8d, r8d"); // pass eflags = 0 so regexec() matches from the current subject cursor emitter.bl_c("regexec"); // 
execute the compiled POSIX regex at the current subject cursor @@ -287,18 +346,67 @@ fn emit_preg_replace_linux_x86_64(emitter: &mut Emitter) { emitter.instruction("jmp __rt_preg_replace_pre_linux_x86_64"); // continue copying the remaining unmatched prefix bytes emitter.label("__rt_preg_replace_repl_linux_x86_64"); - emitter.instruction(&format!("mov rax, QWORD PTR [rsp + {}]", replacement_ptr_off)); // reload the elephc replacement pointer before appending the replacement literal bytes - emitter.instruction(&format!("mov rdx, QWORD PTR [rsp + {}]", replacement_len_off)); // reload the elephc replacement length before appending the replacement literal bytes - emitter.instruction("xor ecx, ecx"); // start copying the replacement literal from offset zero + emitter.instruction(&format!("mov rax, QWORD PTR [rsp + {}]", replacement_ptr_off)); // reload the elephc replacement pointer before scanning replacement bytes + emitter.instruction(&format!("mov rdx, QWORD PTR [rsp + {}]", replacement_len_off)); // reload the elephc replacement length before scanning replacement bytes + emitter.instruction("xor ecx, ecx"); // start scanning the replacement string from offset zero emitter.label("__rt_preg_replace_repl_copy_linux_x86_64"); - emitter.instruction("cmp rcx, rdx"); // stop copying once the full replacement literal has been appended + emitter.instruction("cmp rcx, rdx"); // stop once the full replacement string has been scanned emitter.instruction("jge __rt_preg_replace_advance_linux_x86_64"); // move on to advancing the current subject cursor past the regex match - emitter.instruction("mov r8b, BYTE PTR [rax + rcx]"); // load one replacement literal byte from the elephc replacement string payload - emitter.instruction("mov BYTE PTR [r11], r8b"); // append the replacement literal byte into the replacement output buffer - emitter.instruction("add r11, 1"); // advance the replacement output write cursor after appending one literal byte - emitter.instruction("add rcx, 1"); 
// advance the replacement literal byte index - emitter.instruction("jmp __rt_preg_replace_repl_copy_linux_x86_64"); // continue copying the remaining replacement literal bytes + emitter.instruction("mov r8b, BYTE PTR [rax + rcx]"); // load one replacement byte from the elephc replacement string payload + emitter.instruction("cmp r8b, 36"); // '$' may introduce a replacement backreference + emitter.instruction("je __rt_preg_replace_backref_probe_linux_x86_64"); // inspect the next byte for a group number + emitter.instruction("cmp r8b, 92"); // '\\' may introduce a replacement backreference + emitter.instruction("je __rt_preg_replace_backref_probe_linux_x86_64"); // inspect the next byte for a group number + emitter.label("__rt_preg_replace_repl_literal_linux_x86_64"); + emitter.instruction("mov BYTE PTR [r11], r8b"); // append a literal replacement byte into the output buffer + emitter.instruction("add r11, 1"); // advance the replacement output write cursor after appending one byte + emitter.instruction("add rcx, 1"); // consume one replacement byte + emitter.instruction("jmp __rt_preg_replace_repl_copy_linux_x86_64"); // continue scanning the replacement string + + emitter.label("__rt_preg_replace_backref_probe_linux_x86_64"); + emitter.instruction("lea r10, [rcx + 1]"); // index of potential group digit after the marker + emitter.instruction("cmp r10, rdx"); // replacement ended after marker ? + emitter.instruction("jge __rt_preg_replace_repl_literal_linux_x86_64"); // yes → keep marker literal + emitter.instruction("movzx r9d, BYTE PTR [rax + r10]"); // load potential group digit + emitter.instruction("sub r9d, 48"); // convert ASCII digit to group index + emitter.instruction("cmp r9d, 9"); // is it 0..9 ? 
+ emitter.instruction("ja __rt_preg_replace_repl_literal_linux_x86_64"); // no → keep marker literal + emitter.instruction("mov r10, r9"); // r10 = group index + if emitter.platform.regmatch_t_size() == 16 { + emitter.instruction("shl r10, 4"); // group index * sizeof(regmatch_t) + emitter.instruction(&format!("add r10, {}", regmatch_off)); // offset to selected regmatch_t + emitter.instruction("mov rsi, QWORD PTR [rsp + r10]"); // load rm_so for selected capture + emitter.instruction(&format!( + "mov rdi, QWORD PTR [rsp + r10 + {}]", + emitter.platform.regmatch_rm_eo_offset() + )); // load rm_eo for selected capture + } else { + emitter.instruction("shl r10, 3"); // group index * sizeof(regmatch_t) + emitter.instruction(&format!("add r10, {}", regmatch_off)); // offset to selected regmatch_t + emitter.instruction("movsxd rsi, DWORD PTR [rsp + r10]"); // load rm_so for selected capture + emitter.instruction(&format!( + "movsxd rdi, DWORD PTR [rsp + r10 + {}]", + emitter.platform.regmatch_rm_eo_offset() + )); // load rm_eo for selected capture + } + emitter.instruction("cmp rsi, 0"); // capture was matched ? + emitter.instruction("jl __rt_preg_replace_backref_consume_linux_x86_64"); // unmatched captures expand to an empty string + emitter.instruction(&format!("mov r10, QWORD PTR [rsp + {}]", current_pos_off)); // reload current subject cursor + emitter.instruction("add r10, rsi"); // capture source pointer = current + rm_so + emitter.instruction("sub rdi, rsi"); // capture length = rm_eo - rm_so + emitter.instruction("xor esi, esi"); // capture copy index = 0 + emitter.label("__rt_preg_replace_backref_copy_linux_x86_64"); + emitter.instruction("cmp rsi, rdi"); // copied entire capture ? 
+ emitter.instruction("jge __rt_preg_replace_backref_consume_linux_x86_64"); // yes → consume marker and group digit + emitter.instruction("mov r8b, BYTE PTR [r10 + rsi]"); // load capture byte + emitter.instruction("mov BYTE PTR [r11], r8b"); // append capture byte into the output buffer + emitter.instruction("add r11, 1"); // advance output after appending capture byte + emitter.instruction("add rsi, 1"); // advance capture index + emitter.instruction("jmp __rt_preg_replace_backref_copy_linux_x86_64"); // continue copying capture bytes + emitter.label("__rt_preg_replace_backref_consume_linux_x86_64"); + emitter.instruction("add rcx, 2"); // consume marker plus group digit + emitter.instruction("jmp __rt_preg_replace_repl_copy_linux_x86_64"); // continue scanning the replacement string emitter.label("__rt_preg_replace_advance_linux_x86_64"); emitter.instruction(&format!("mov QWORD PTR [rsp + {}], r11", output_write_off)); // preserve the updated replacement output write cursor before advancing the subject cursor diff --git a/tests/codegen/system.rs b/tests/codegen/system.rs index 7f23b445..16a56366 100644 --- a/tests/codegen/system.rs +++ b/tests/codegen/system.rs @@ -959,6 +959,26 @@ fn test_preg_replace_case_insensitive() { let out = compile_and_run(r#" Date: Sat, 16 May 2026 14:28:52 +0200 Subject: [PATCH 3/4] docs: document date and regex parity --- ROADMAP.md | 2 +- docs/internals/the-runtime.md | 4 ++-- docs/php/strings.md | 4 ++-- docs/php/system-and-io.md | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ROADMAP.md b/ROADMAP.md index 1f92d50c..8c2873dd 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -442,7 +442,7 @@ runtime helpers, and standard-library surfaces. 
- [x] Mixed indexed/associative array union — model `array + array` across indexed/hash representations while preserving PHP's shared int/string key space and left-key precedence - [X] Callable parity follow-up — support captured method/static first-class callables in the remaining callback runtimes (`array_reduce()`, `array_walk()`, `usort()`, `uksort()`, `uasort()`), direct callable expression calls such as `($obj->method(...))()`, non-local method receivers such as `(new Foo())->method(...)`, nullsafe first-class callables, broader builtin first-class callable wrappers, and the remaining `call_user_func_array()` by-reference callback gaps - [ ] Runtime-value compatibility polishing v2 — continue with PHP's uninitialized typed-property state, integer overflow promotion, broader loose-comparison semantics, and future warning/notice sites as they are added -- [ ] Broader date and regex PHP parity — expand `strtotime()` relative formats and PCRE-compatible regex features/captures/backreferences (JSON parity now closed: see v0.8.x base + v0.20.x polish) +- [x] Broader date and regex PHP parity — expand `strtotime()` relative formats with `a/an unit` article offsets and add `preg_replace()` capture backreference expansion (`$0`..`$9`, `\0`..`\9`) over the POSIX bridge (JSON parity now closed: see v0.8.x base + v0.20.x polish) - [x] JSON encoder optimization — folded `__rt_json_assoc_is_list_shape` into the main associative-array encoding walk. `__rt_json_encode_assoc` now emits a provisional object form, tracks whether keys remain `0..count-1` while iterating the hash once, and compacts the finished buffer in-place to `[...]` only for real list-shape payloads. Object-shape inputs still stay object form, and `JSON_FORCE_OBJECT` disables compaction. - [x] JSON decoder optimization — fused the `__rt_json_validate` pre-pass into `__rt_json_decode_mixed` for `json_decode()`.
The wrapper now calls the checked structural decoder directly; the decoder trims the input once, validates scalar strings/numbers at the point where they are decoded, enforces depth around containers, records syntax/depth/UTF-16 errors internally, and returns null-on-error for the PHP-facing wrapper. `json_validate()` keeps the standalone RFC 8259 validator surface. - [x] JSON encoder optimization — extended the `_json_active_flags` callee-saved-register cache to `__rt_json_encode_assoc` and `__rt_json_encode_array_dynamic` (`x19` ARM64 / `r15` x86_64). The recursive encoder chain now preserves that cache: `__rt_json_encode_object` no longer clobbers ARM64 `x19`, and the x86_64 string encoder keeps `r15` dedicated to cached flags during UTF-8 decoding. diff --git a/docs/internals/the-runtime.md b/docs/internals/the-runtime.md index cae0d971..d1449154 100644 --- a/docs/internals/the-runtime.md +++ b/docs/internals/the-runtime.md @@ -343,7 +343,7 @@ The fatal uncaught-exception path writes `Fatal error: uncaught exception` to st |---|---|---|---| | `__rt_date` | Format a Unix timestamp using PHP date format characters (Y, m, d, H, i, s, l, F, etc.). Uses `localtime_r()` from libc and static lookup tables (`_day_names`, `_month_names`) for day/month names | `x1`/`x2` = format string, `x0` = timestamp | `x1`/`x2` = formatted string | | `__rt_mktime` | Create a Unix timestamp from date components (hour, minute, second, month, day, year). Populates a `tm` struct on the stack and calls libc `mktime()` | `x0`-`x5` = h, m, s, mon, day, year | `x0` = Unix timestamp | -| `__rt_strtotime` | Parse trimmed date/time strings through strategy emitters: ISO dates/datetimes, time-only forms, bare keywords (`now`, `today`, `tomorrow`, `yesterday`, `midnight`, `noon`), relative offsets (`+1 day`, `3 months ago`), and named weekdays with `next` / `last` / `this`. Successful paths populate a `tm` struct and call libc `mktime()`; malformed input returns `-1`. 
| `x1`/`x2` = date string | `x0` = Unix timestamp or `-1` | +| `__rt_strtotime` | Parse trimmed date/time strings through strategy emitters: ISO dates/datetimes, time-only forms, bare keywords (`now`, `today`, `tomorrow`, `yesterday`, `midnight`, `noon`), relative offsets (`+1 day`, `3 months ago`, `a/an <unit>` article forms), and named weekdays with `next` / `last` / `this`. Successful paths populate a `tm` struct and call libc `mktime()`; malformed input returns `-1`. | `x1`/`x2` = date string | `x0` = Unix timestamp or `-1` | ### JSON routines @@ -383,7 +383,7 @@ All regex routines use **POSIX extended regular expressions** via libc's `regcom |---|---|---|---| | `__rt_preg_match` | Test if a regex matches the subject string. Compiles the pattern, executes once, frees | pattern + subject strings | `x0` = 1 (match) or 0 (no match) | | `__rt_preg_match_all` | Count all non-overlapping matches by repeatedly executing the regex with advancing offsets | pattern + subject strings | `x0` = match count | -| `__rt_preg_replace` | Replace all regex matches with a replacement string. Builds the result incrementally in the concat buffer | pattern + replacement + subject | `x1`/`x2` = result string | +| `__rt_preg_replace` | Replace all regex matches with a replacement string. Builds the result incrementally in the concat buffer and expands `$0`..`$9` / `\0`..`\9` from the `regexec()` capture vector | pattern + replacement + subject | `x1`/`x2` = result string | | `__rt_preg_split` | Split the subject string at regex match boundaries. Returns a string array of the non-matching segments | pattern + subject strings | `x0` = array pointer | ## I/O routines diff --git a/docs/php/strings.md b/docs/php/strings.md index b2d57e08..f9426570 100644 --- a/docs/php/strings.md +++ b/docs/php/strings.md @@ -132,12 +132,12 @@ Read-only. Negative indices count from end. Out-of-bounds returns empty string.
| `ctype_space()` | `ctype_space($str): bool` | All chars are whitespace | | `preg_match()` | `preg_match($pattern, $subject): int` | Test if regex matches (1 or 0). Uses POSIX extended regex. | | `preg_match_all()` | `preg_match_all($pattern, $subject): int` | Count all non-overlapping matches | -| `preg_replace()` | `preg_replace($pattern, $replacement, $subject): string` | Replace all regex matches | +| `preg_replace()` | `preg_replace($pattern, $replacement, $subject): string` | Replace all regex matches; supports `$0`..`$9` and `\0`..`\9` replacement backreferences | | `preg_split()` | `preg_split($pattern, $subject): array` | Split string by regex pattern | ### Regex limitations - Uses POSIX extended regex via libc, with translation of common PCRE shorthands (`\s`, `\d`, `\w`) +- `preg_replace()` expands `$0`..`$9` and `\0`..`\9` to captured groups; unmatched optional groups expand to an empty string - Lookahead, lookbehind, non-greedy quantifiers are not supported - `preg_match()` does not support `$matches` capture parameter -- `preg_replace()` does not support backreferences like `$1` diff --git a/docs/php/system-and-io.md b/docs/php/system-and-io.md index 87d4ecb3..c0575a6b 100644 --- a/docs/php/system-and-io.md +++ b/docs/php/system-and-io.md @@ -51,7 +51,7 @@ sidebar: - **ISO date / datetime** — `YYYY-MM-DD`, `YYYY-MM-DD HH:MM`, `YYYY-MM-DD HH:MM:SS`, `YYYY-MM-DDTHH:MM`, or `YYYY-MM-DDTHH:MM:SS`. Lowercase `t` is also accepted as the date/time separator. - **Bare keywords** — `now`, `today`, `tomorrow`, `yesterday`, `midnight`, `noon`. (`midnight` is an alias for `today`.) - **Time-only** — `H:MM`, `HH:MM`, `H:MM:SS`, `HH:MM:SS` — combined with today's date. -- **Relative offsets** — `[+-]?N unit [N unit ...]` and `N unit ago` (negates the whole expression). Units: `sec(s)`, `second(s)`, `min(s)`, `minute(s)`, `hour(s)`, `day(s)`, `week(s)`, `month(s)`, `year(s)`. Composite forms like `"+1 day 2 hours"` and `"3 months ago"` are supported. 
Day/week offsets honor DST through libc `mktime` normalization. +- **Relative offsets** — `[+-]?N unit [N unit ...]`, `a/an unit`, and `N unit ago` / `a/an unit ago` (negates the whole expression). Units: `sec(s)`, `second(s)`, `min(s)`, `minute(s)`, `hour(s)`, `day(s)`, `week(s)`, `month(s)`, `year(s)`. Composite forms like `"+1 day 2 hours"`, `"an hour"`, and `"a day ago"` are supported. Day/week offsets honor DST through libc `mktime` normalization. - **Named weekdays** — `Monday`..`Sunday` and 3-letter abbreviations `Mon`..`Sun`. Modifiers: `next <weekday>` (next future occurrence; today + 7 if today matches), `last <weekday>` (most recent past; today - 7 if today matches), `this <weekday>` (delta may be zero when today matches). Result is midnight of the target day. Currently out of scope (not accepted): timezone offsets (`+0200`, `UTC`, ...), `@unix_timestamp` form, `first/last day of` patterns, `MM/DD/YYYY` and `DD-Mon-YYYY` alternative date shapes, `nth <weekday> of <month>` patterns. Malformed input returns `-1`. @@ -131,10 +131,10 @@ Encoding rules for objects: |---|---|---| | `preg_match()` | `preg_match($pattern, $subject): int` | Test regex match (1 or 0) | | `preg_match_all()` | `preg_match_all($pattern, $subject): int` | Count all non-overlapping matches | -| `preg_replace()` | `preg_replace($pattern, $replacement, $subject): string` | Replace all regex matches | +| `preg_replace()` | `preg_replace($pattern, $replacement, $subject): string` | Replace all regex matches; `$0`..`$9` and `\0`..`\9` replacement backreferences expand captured groups | | `preg_split()` | `preg_split($pattern, $subject): array` | Split string by regex | -Uses POSIX extended regex with common PCRE shorthand translation (`\s`, `\d`, `\w`). Lookahead, lookbehind, non-greedy quantifiers not supported. +Uses POSIX extended regex with common PCRE shorthand translation (`\s`, `\d`, `\w`). Replacement backreferences `$0`..`$9` and `\0`..`\9` are expanded by `preg_replace()`.
Lookahead, lookbehind, non-greedy quantifiers, and the `$matches` output parameter for `preg_match()` are not supported. ## File I/O From e9798bc0b72c99088d805325635432819841bc7c Mon Sep 17 00:00:00 2001 From: Vincenzo Petrucci Date: Sat, 16 May 2026 14:55:21 +0200 Subject: [PATCH 4/4] chore: satisfy preg_replace instruction comment check --- src/codegen/runtime/system/preg_replace.rs | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) diff --git a/src/codegen/runtime/system/preg_replace.rs b/src/codegen/runtime/system/preg_replace.rs index f8070295..cfdd3b71 100644 --- a/src/codegen/runtime/system/preg_replace.rs +++ b/src/codegen/runtime/system/preg_replace.rs @@ -23,6 +23,7 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { } let regex_t_size = emitter.platform.regex_t_size(); + let regmatch_rm_eo_off = emitter.platform.regmatch_rm_eo_offset(); let regmatch_off = regex_t_size; let regmatches_size = emitter.platform.regmatch_t_size() * PREG_REPLACE_NMATCH; let pattern_ptr_off = regmatch_off + regmatches_size; @@ -154,19 +155,13 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { emitter.instruction("lsl x14, x14, #4"); // group index * sizeof(regmatch_t) emitter.instruction(&format!("add x14, x14, #{}", regmatch_off)); // offset to selected regmatch_t emitter.instruction("ldr x15, [sp, x14]"); // load rm_so for selected capture - emitter.instruction(&format!( - "add x14, x14, #{}", - emitter.platform.regmatch_rm_eo_offset() - )); // offset to rm_eo + emitter.instruction(&format!("add x14, x14, #{}", regmatch_rm_eo_off)); // offset to rm_eo emitter.instruction("ldr x16, [sp, x14]"); // load rm_eo for selected capture } else { emitter.instruction("lsl x14, x14, #3"); // group index * sizeof(regmatch_t) emitter.instruction(&format!("add x14, x14, #{}", regmatch_off)); // offset to selected regmatch_t emitter.instruction("ldrsw x15, [sp, x14]"); // load rm_so for selected capture - emitter.instruction(&format!( - "add x14, 
x14, #{}", - emitter.platform.regmatch_rm_eo_offset() - )); // offset to rm_eo + emitter.instruction(&format!("add x14, x14, #{}", regmatch_rm_eo_off)); // offset to rm_eo emitter.instruction("ldrsw x16, [sp, x14]"); // load rm_eo for selected capture } emitter.instruction("cmp x15, #0"); // capture was matched ? @@ -243,6 +238,7 @@ pub(crate) fn emit_preg_replace(emitter: &mut Emitter) { fn emit_preg_replace_linux_x86_64(emitter: &mut Emitter) { let regex_t_size = emitter.platform.regex_t_size(); + let regmatch_rm_eo_off = emitter.platform.regmatch_rm_eo_offset(); let regmatch_off = regex_t_size; let regmatches_size = emitter.platform.regmatch_t_size() * PREG_REPLACE_NMATCH; let pattern_ptr_off = regmatch_off + regmatches_size; @@ -377,18 +373,12 @@ fn emit_preg_replace_linux_x86_64(emitter: &mut Emitter) { emitter.instruction("shl r10, 4"); // group index * sizeof(regmatch_t) emitter.instruction(&format!("add r10, {}", regmatch_off)); // offset to selected regmatch_t emitter.instruction("mov rsi, QWORD PTR [rsp + r10]"); // load rm_so for selected capture - emitter.instruction(&format!( - "mov rdi, QWORD PTR [rsp + r10 + {}]", - emitter.platform.regmatch_rm_eo_offset() - )); // load rm_eo for selected capture + emitter.instruction(&format!("mov rdi, QWORD PTR [rsp + r10 + {}]", regmatch_rm_eo_off)); // load rm_eo for selected capture } else { emitter.instruction("shl r10, 3"); // group index * sizeof(regmatch_t) emitter.instruction(&format!("add r10, {}", regmatch_off)); // offset to selected regmatch_t emitter.instruction("movsxd rsi, DWORD PTR [rsp + r10]"); // load rm_so for selected capture - emitter.instruction(&format!( - "movsxd rdi, DWORD PTR [rsp + r10 + {}]", - emitter.platform.regmatch_rm_eo_offset() - )); // load rm_eo for selected capture + emitter.instruction(&format!("movsxd rdi, DWORD PTR [rsp + r10 + {}]", regmatch_rm_eo_off)); // load rm_eo for selected capture } emitter.instruction("cmp rsi, 0"); // capture was matched ? 
emitter.instruction("jl __rt_preg_replace_backref_consume_linux_x86_64"); // unmatched captures expand to an empty string