diff --git a/sds/Cargo.toml b/sds/Cargo.toml index 026d755d..28c4648f 100644 --- a/sds/Cargo.toml +++ b/sds/Cargo.toml @@ -25,7 +25,7 @@ nom = "7.1.3" regex = "1.9.5" regex-automata = "0.4.4" regex-syntax = "0.7.5" -serde = { version = "1.0", features = ["derive"] } +serde = { version = "1.0", features = ["derive", "rc"] } serde_with = "3.6.1" thiserror = "1.0.58" metrics = "0.24.0" diff --git a/sds/src/lib.rs b/sds/src/lib.rs index 3583cbef..ddee2169 100644 --- a/sds/src/lib.rs +++ b/sds/src/lib.rs @@ -49,7 +49,8 @@ pub use scanner::{ }; pub use scoped_ruleset::ExclusionCheck; pub use validation::{ - get_regex_complexity_estimate_very_slow, validate_regex, RegexValidationError, + get_regex_complexity_estimate_very_slow, validate_regex, validate_regex_and_get_ast, + RegexValidationError, }; #[cfg(any(feature = "testing", feature = "bench"))] diff --git a/sds/src/normalization/rust_regex_adapter.rs b/sds/src/normalization/rust_regex_adapter.rs index 01cb4c8d..3a04e3d7 100644 --- a/sds/src/normalization/rust_regex_adapter.rs +++ b/sds/src/normalization/rust_regex_adapter.rs @@ -42,6 +42,12 @@ pub fn convert_to_rust_regex(pattern: &str) -> Result { Ok(regex_ast.to_string()) } +pub fn convert_to_ast(pattern: &str) -> Result { + let sds_ast = parse_regex_pattern(pattern)?; + convert_ast(&sds_ast)?; + Ok(sds_ast) +} + // This is private since only ASTs generated from the parser are supported. // (Manually crafted ASTs may cause issues). fn convert_ast(sds_ast: &SdsAst) -> Result { diff --git a/sds/src/parser/ast.rs b/sds/src/parser/ast.rs index 6ce33902..9d7ee651 100644 --- a/sds/src/parser/ast.rs +++ b/sds/src/parser/ast.rs @@ -1,23 +1,58 @@ +use serde::{Serialize, Serializer}; use std::rc::Rc; - +use serde::ser::SerializeMap; +use regex_syntax::ast::Alternation; /// The Abstract Syntax Tree describing a regex pattern. The AST is designed /// to preserve behavior, but doesn't necessarily preserve the exact syntax. 
-#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum Ast { Empty, + //Char Literal(Literal), + //abc - Alternative Concat(Vec), + // Group Group(Rc), + // CharacterClass CharacterClass(CharacterClass), // May be empty + // Disjunction + // a|b|c Alternation(Vec), + // Repetition Repetition(Repetition), + // Assertion Assertion(AssertionType), + // Tree -> Flags Flags(Flags), } -#[derive(Copy, Clone, Debug)] + +impl Serialize for Ast { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_map(Some(2))?; + match self { + Ast::Literal(literal) => { + serializer..se + } + } + state.serialize_entry("type", "Alternative")?; + + if let Ast::Alternation(expression) = self { + state.serialize_entry("expressions", expression)?; + } else { + state.serialize_entry("expressions", &vec![])?; + } + state.end() + } +} + +#[derive(Serialize, Copy, Clone, Debug)] pub struct Literal { + #[serde(rename = "value")] pub c: char, // whether a literal is escaped or not can change the behavior in some cases, @@ -32,24 +67,52 @@ pub enum Group { NamedCapturing(NamedCapturingGroup), } -#[derive(Clone, Debug)] +impl Serialize for Group { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + let mut state = serializer.serialize_map(Some(4))?; + state.serialize_entry("type", "Group")?; + match self { + Group::Capturing(group) => { + state.serialize_entry("capturing", &false)?; + state.serialize_entry("name", "")?; + state.serialize_entry("expression", &group.inner)?; + } + Group::NonCapturing(group) => { + state.serialize_entry("capturing", &true)?; + state.serialize_entry("name", "")?; + state.serialize_entry("expression", &group.inner)?; + } + Group::NamedCapturing(group) => { + state.serialize_entry("capturing", &true)?; + state.serialize_entry("name", &group.name)?; + state.serialize_entry("expression", &group.inner)?; + } + } + state.end() + } +} + +#[derive(Serialize, Clone, Debug)] pub 
struct CaptureGroup { pub inner: Ast, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct NonCapturingGroup { pub flags: Flags, pub inner: Ast, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct NamedCapturingGroup { pub name: String, pub inner: Ast, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum CharacterClass { Bracket(BracketCharacterClass), Perl(PerlCharacterClass), @@ -61,13 +124,13 @@ pub enum CharacterClass { UnicodeProperty(UnicodePropertyClass), } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct UnicodePropertyClass { pub negate: bool, pub name: String, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum QuantifierKind { /// * ZeroOrMore, @@ -83,13 +146,13 @@ pub enum QuantifierKind { OneOrMore, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct Quantifier { pub lazy: bool, pub kind: QuantifierKind, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum PerlCharacterClass { Digit, Space, @@ -99,13 +162,13 @@ pub enum PerlCharacterClass { NonWord, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct BracketCharacterClass { pub negated: bool, pub items: Vec, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum BracketCharacterClassItem { Literal(char), Range(char, char), @@ -118,13 +181,13 @@ pub enum BracketCharacterClassItem { NotVerticalWhitespace, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct AsciiClass { pub negated: bool, pub kind: AsciiClassKind, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum AsciiClassKind { Alnum, Alpha, @@ -142,13 +205,13 @@ pub enum AsciiClassKind { Xdigit, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct Repetition { pub quantifier: Quantifier, pub inner: Rc, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub enum AssertionType { /// \b 
WordBoundary, @@ -172,7 +235,7 @@ pub enum AssertionType { EndTextOptionalNewline, } -#[derive(Clone, Debug)] +#[derive(Serialize, Clone, Debug)] pub struct Flags { /// Flags before a "-" pub add: Vec, @@ -180,7 +243,7 @@ pub struct Flags { pub remove: Vec, } -#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Serialize, Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)] pub enum Flag { /// i CaseInsensitive, diff --git a/sds/src/validation.rs b/sds/src/validation.rs index 89921b65..17974d59 100644 --- a/sds/src/validation.rs +++ b/sds/src/validation.rs @@ -1,4 +1,6 @@ -use crate::normalization::rust_regex_adapter::{convert_to_rust_regex, QUANTIFIER_LIMIT}; +use crate::normalization::rust_regex_adapter::{ + convert_to_ast, convert_to_rust_regex, QUANTIFIER_LIMIT, +}; use crate::parser::error::ParseError; use regex_automata::meta::{self}; use thiserror::Error; @@ -40,6 +42,17 @@ pub fn validate_regex(input: &str) -> Result<(), RegexValidationError> { validate_and_create_regex(input).map(|_| ()) } +/// Checks that a regex pattern is valid for use in an SDS scanner and returns its parsed AST +pub fn validate_regex_and_get_ast( + input: &str, +) -> Result { + // Build the regex first so an invalid pattern is rejected (the built regex is discarded), + // then parse the pattern via `convert_to_ast` to obtain the AST that is returned. + validate_and_create_regex(input)?; + let ast = convert_to_ast(input)?; + Ok(ast) +} + pub fn get_regex_complexity_estimate_very_slow(input: &str) -> Result { // The regex crate doesn't directly give you access to the "complexity", but it does // reject if it's too large, so we can binary search to find the limit. 
@@ -113,6 +126,7 @@ fn build_regex( #[cfg(test)] mod test { + use crate::validate_regex_and_get_ast; use crate::validation::{ get_regex_complexity_estimate_very_slow, validate_and_create_regex, validate_regex, RegexValidationError, @@ -135,6 +149,14 @@ mod test { assert!(validate_regex("(a|)b").is_ok(),); } + #[test] + fn test_ast() { + // a capturing group holding an alternation and a bounded repetition, serialized to JSON + assert_eq!( + validate_regex_and_get_ast("(a|.{2,4})b").map(|ast| {serde_json::to_string(&ast).unwrap()}).unwrap(), + "{\"type\":\"Concat\",\"content\":[{\"type\":\"Group\",\"content\":{\"Capturing\":{\"inner\":{\"type\":\"Alternation\",\"content\":[{\"type\":\"Literal\",\"content\":{\"value\":\"a\",\"escaped\":false}},{\"type\":\"Repetition\",\"content\":{\"quantifier\":{\"lazy\":false,\"kind\":{\"RangeMinMax\":[2,4]}},\"inner\":{\"type\":\"CharacterClass\",\"content\":\"Dot\"}}}]}}}},{\"type\":\"Literal\",\"content\":{\"value\":\"b\",\"escaped\":false}}]}"); + } + #[test] fn too_complex_pattern_is_rejected() { assert_eq!(