DataDog · vinckama · Jun 18, 2025
@@ -27,7 +27,7 @@ regex-automata = "0.4.7"
 # Switch over to the original repo when this issue is resolved: https://github.com/rust-lang/regex/issues/1241
 regex-automata-fork = { git = "https://github.com/fbryden/regex", rev = "6952250af962ca3e364da47382b16dba9c703431", package = "regex-automata" }
 regex-syntax = "0.7.5"
-serde = { version = "1.0", features = ["derive"] }
+serde = { version = "1.0", features = ["derive", "rc"] }
 serde_with = "3.6.1"
 strum = { version = "0.25", features = ["derive"] }
 thiserror = "1.0.58"

@@ -40,6 +40,7 @@ pub use path::{Path, PathSegment};
 pub use rule_match::{ReplacementType, RuleMatch};
 pub use scanner::shared_pool::{SharedPool, SharedPoolGuard};
 
+pub use parser::{ast::Ast, regex_parser::parse_regex_pattern};
 pub use scanner::error::MatchValidationError;
 pub use scanner::{
     config::RuleConfig,
@@ -52,7 +53,8 @@ pub use scanner::{
 };
 pub use scoped_ruleset::ExclusionCheck;
 pub use validation::{
-    get_regex_complexity_estimate_very_slow, validate_regex, RegexValidationError,
+    get_regex_complexity_estimate_very_slow, validate_regex, validate_regex_and_get_ast,
+    RegexValidationError,
 };
 
 #[cfg(any(feature = "testing", feature = "bench"))]

@@ -1,55 +1,58 @@
+use serde::{Deserialize, Serialize};
 use std::rc::Rc;
 
 /// The Abstract Syntax Tree describing a regex pattern. The AST is designed
 /// to preserve behavior, but doesn't necessarily preserve the exact syntax.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "type", content = "content")]
 pub enum Ast {
     Empty,
     Literal(Literal),
     Concat(Vec<Ast>),
     Group(Rc<Group>),
     CharacterClass(CharacterClass),
-    // May be empty
     Alternation(Vec<Ast>),
     Repetition(Repetition),
     Assertion(AssertionType),
     Flags(Flags),
 }
 
-#[derive(Copy, Clone, Debug)]
+#[derive(Copy, Clone, Debug, Serialize, Deserialize)]
 pub struct Literal {
+    #[serde(rename = "value")]
     pub c: char,
 
     // whether a literal is escaped or not can change the behavior in some cases,
     // such as whether or not it's ignored by the `x` (extended / verbose) flag.
     pub escaped: bool,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "group_type", content = "content")]
 pub enum Group {
     Capturing(CaptureGroup),
     NonCapturing(NonCapturingGroup),
     NamedCapturing(NamedCapturingGroup),
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct CaptureGroup {
     pub inner: Ast,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct NonCapturingGroup {
     pub flags: Flags,
     pub inner: Ast,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct NamedCapturingGroup {
     pub name: String,
     pub inner: Ast,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum CharacterClass {
     Bracket(BracketCharacterClass),
     Perl(PerlCharacterClass),
@@ -61,13 +64,13 @@ pub enum CharacterClass {
     UnicodeProperty(UnicodePropertyClass),
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct UnicodePropertyClass {
     pub negate: bool,
     pub name: String,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum QuantifierKind {
     /// *
     ZeroOrMore,
@@ -83,13 +86,13 @@ pub enum QuantifierKind {
     OneOrMore,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Quantifier {
     pub lazy: bool,
     pub kind: QuantifierKind,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum PerlCharacterClass {
     Digit,
     Space,
@@ -99,13 +102,13 @@ pub enum PerlCharacterClass {
     NonWord,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct BracketCharacterClass {
     pub negated: bool,
     pub items: Vec<BracketCharacterClassItem>,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum BracketCharacterClassItem {
     Literal(char),
     Range(char, char),
@@ -118,13 +121,14 @@ pub enum BracketCharacterClassItem {
     NotVerticalWhitespace,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct AsciiClass {
     pub negated: bool,
     pub kind: AsciiClassKind,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
 pub enum AsciiClassKind {
     Alnum,
     Alpha,
@@ -142,13 +146,16 @@ pub enum AsciiClassKind {
     Xdigit,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Repetition {
+    #[serde(rename = "quantifier")]
     pub quantifier: Quantifier,
+    #[serde(rename = "expression")]
     pub inner: Rc<Ast>,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
 pub enum AssertionType {
     /// \b
     WordBoundary,
@@ -167,20 +174,17 @@ pub enum AssertionType {
 
     /// \z
     EndText,
-
-    /// \Z
     EndTextOptionalNewline,
 }
 
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct Flags {
-    /// Flags before a "-"
     pub add: Vec<Flag>,
-    /// Flags after a "-"
     pub remove: Vec<Flag>,
 }
 
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+#[serde(rename_all = "lowercase")]
 pub enum Flag {
     /// i
     CaseInsensitive,

@@ -1,5 +1,6 @@
 use crate::normalization::rust_regex_adapter::{convert_to_rust_regex, QUANTIFIER_LIMIT};
 use crate::parser::error::ParseError;
+use crate::parser::regex_parser::parse_regex_pattern;
 use regex_automata::meta::{self};
 use thiserror::Error;
 
@@ -40,6 +41,16 @@ pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> {
     validate_and_create_regex(input).map(|_| ())
 }
 
+/// Parses a regex pattern and returns its AST, or a `RegexValidationError` if parsing fails.
+pub fn validate_regex_and_get_ast(
+    input: &str,
+) -> Result<crate::parser::ast::Ast, RegexValidationError> {
+    // NOTE(review): only `parse_regex_pattern` runs here, not `validate_and_create_regex`, so
+    // this function itself builds no Regex — confirm parsing alone is the intended validation.
+    let sds_ast = parse_regex_pattern(input)?;
+    Ok(sds_ast)
+}
+
 pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result<usize, RegexValidationError> {
     // The regex crate doesn't directly give you access to the "complexity", but it does
     // reject if it's too large, so we can binary search to find the limit.
@@ -115,7 +126,7 @@ fn build_regex(
 mod test {
     use crate::validation::{
         get_regex_complexity_estimate_very_slow, validate_and_create_regex, validate_regex,
-        RegexValidationError,
+        validate_regex_and_get_ast, RegexValidationError,
     };
 
     #[test]
@@ -183,4 +194,62 @@ mod test {
             Ok(1_040_136)
         );
     }
+
+    #[test]
+    fn test_parse_regex_pattern() {
+        let pattern: &'static str = "^(?:\\w|b)?"; // i.e. the regex `^(?:\w|b)?`
+        let ast = validate_regex_and_get_ast(pattern).unwrap();
+        let json = serde_json::to_string_pretty(&ast).unwrap(); // pins the serialized JSON layout
+        assert_eq!(
+            r###"{
+  "type": "Concat",
+  "content": [
+    {
+      "type": "Assertion",
+      "content": "startline"
+    },
+    {
+      "type": "Repetition",
+      "content": {
+        "quantifier": {
+          "lazy": false,
+          "kind": "ZeroOrOne"
+        },
+        "expression": {
+          "type": "Group",
+          "content": {
+            "group_type": "NonCapturing",
+            "content": {
+              "flags": {
+                "add": [],
+                "remove": []
+              },
+              "inner": {
+                "type": "Alternation",
+                "content": [
+                  {
+                    "type": "CharacterClass",
+                    "content": {
+                      "Perl": "Word"
+                    }
+                  },
+                  {
+                    "type": "Literal",
+                    "content": {
+                      "value": "b",
+                      "escaped": false
+                    }
+                  }
+                ]
+              }
+            }
+          }
+        }
+      }
+    }
+  ]
+}"###,
+            json
+        );
+    }
 }