From dc2ea01ded68262426417750815095211708a9b2 Mon Sep 17 00:00:00 2001 From: Thomas Feldmann Date: Wed, 16 Nov 2022 16:50:40 +0100 Subject: [PATCH 1/3] start working on v2 --- simplematch.py | 4 +-- simplematch/__init__.py | 0 simplematch/converters.py | 71 ++++++++++++++++++++++++++++++++++++++ simplematch/simplematch.py | 53 ++++++++++++++++++++++++++++ 4 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 simplematch/__init__.py create mode 100644 simplematch/converters.py create mode 100644 simplematch/simplematch.py diff --git a/simplematch.py b/simplematch.py index 281ad5b..22418a8 100644 --- a/simplematch.py +++ b/simplematch.py @@ -18,7 +18,7 @@ def register_type(name, regex, converter=str): - """ register a type to be available for the {value:type} matching syntax """ + """register a type to be available for the {value:type} matching syntax""" cleaned = TYPE_CLEANUP_REGEX.sub("(?:", regex) types[name] = Type(regex=cleaned, converter=converter) @@ -133,7 +133,7 @@ def _create_regex(self, pattern): @staticmethod def _grouplist(match): - """ extract unnamed match groups """ + """extract unnamed match groups""" # https://stackoverflow.com/a/53385788/300783 named = match.groupdict() ignored_groups = set() diff --git a/simplematch/__init__.py b/simplematch/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/simplematch/converters.py b/simplematch/converters.py new file mode 100644 index 0000000..dc1b44e --- /dev/null +++ b/simplematch/converters.py @@ -0,0 +1,71 @@ +from decimal import Decimal + + +class Str: + regex = r".*" + + def to_python(self, value: str) -> str: + return value + + +class Int: + regex = r"[+-]?[0-9]" + + def __init__(self, len=None, *, min=None, max=None): + pass + + def to_python(self, value: str) -> int: + return int(value) + + +class Float: + regex = r"[+-]?([0-9]*[.])?[0-9]+" + + def to_python(self, value: str) -> float: + return float(value) + + +class Decimal(Float): + def to_python(self, value: str) -> Decimal: + return Decimal(value) + + +class FourDigitYear(Int): + regex = "[0-9]{4}" + + def to_python(self, value: str) -> int: + return int(value) + + +class Letters(Str): + regex = r"[a-zA-Z]+" + + +class RomanNumeral(Int): + pass + + +class Bitcoin(Str): + regex = r"(bc1|[13])[a-zA-HJ-NP-Z0-9]{25,39}" + + +class Email(Str): + regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+" + + +class Url(Str): + regex = ( + r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b" + r"([-a-zA-Z0-9()!@:%_\+.~#?&\/\/=]*)" + ) + + +class IpV4(Str): + regex = ( + r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" + r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" + ) + + +class SocialSecurityNumber(Str): + regex = r"(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}" diff --git a/simplematch/simplematch.py b/simplematch/simplematch.py new file mode 100644 index 0000000..f6834b5 --- /dev/null +++ b/simplematch/simplematch.py @@ -0,0 +1,53 @@ +""" +simplematch +""" +import re + + +class Environment: + def __init__(self, block_start_string: str, block_end_string: str): + # https://regex101.com/r/xS2B04/3 + safe_chars = r"[^:\[\]{}{}]".format(block_start_string, block_end_string) + self.block_regex = re.compile( + r""" + (?") + + +txt = """ + \{test} + {test:test[123]} + °C + < year : int[max=4]>-- + <:float>*<:float><:float[ len = 2, case_sensitive]> + <:float>\**\ + + """ + +for x in DEFAULT_ENV.parse(txt): + print(x) From c74ef0bf55915bc993b3e304e483303184cf849a Mon Sep 17 00:00:00 2001 From: Thomas Feldmann Date: Thu, 17 Nov 2022 12:51:43 +0100 Subject: [PATCH 2/3] wip --- pyproject.toml | 5 ++ simplematch/converters.py | 84 ++++++++++++++--- simplematch/py.typed | 0 simplematch/simplematch.py | 178 +++++++++++++++++++++++++++++-------- test_simplematch.py | 3 +- 5 files changed, 218 insertions(+), 52 deletions(-) create mode 100644 simplematch/py.typed diff --git a/pyproject.toml b/pyproject.toml index 7ac141c..8e40bb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,11 @@ classifiers = [ [tool.poetry.dependencies] python = "^3.5" +[tool.isort] +profile = "black" +skip_gitignore = true +line_length = 88 + [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" diff --git a/simplematch/converters.py b/simplematch/converters.py index dc1b44e..c32888c 100644 --- a/simplematch/converters.py +++ b/simplematch/converters.py @@ -1,39 +1,42 @@ -from decimal import Decimal +import decimal +from ipaddress import IPv4Address class Str: regex = r".*" - def to_python(self, value: str) -> str: + @staticmethod + def to_python(value: str) -> str: return value class Int: regex = r"[+-]?[0-9]" - def __init__(self, len=None, *, min=None, max=None): - pass - - def to_python(self, value: str) -> int: + @staticmethod + def to_python(value: str) -> int: return int(value) class Float: regex = r"[+-]?([0-9]*[.])?[0-9]+" - def to_python(self, value: str) -> float: + @staticmethod + def to_python(value: str) -> float: return float(value) class Decimal(Float): - def to_python(self, value: str) -> Decimal: - return Decimal(value) + @staticmethod + def to_python(value: str) -> decimal.Decimal: + return decimal.Decimal(value) class FourDigitYear(Int): regex = "[0-9]{4}" - def to_python(self, value: str) -> int: + @staticmethod + def to_python(value: str) -> int: return int(value) @@ -42,7 +45,7 @@ class Letters(Str): class RomanNumeral(Int): - pass + regex = r"M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})" class Bitcoin(Str): @@ -60,12 +63,69 @@ class Url(Str): ) -class IpV4(Str): +class IpV4: regex = ( r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)" r"(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" ) + def to_python(self, value) -> IPv4Address: + return IPv4Address(value) + + +class IpV6: + regex = ( + r"(([0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,7}:|([0-9a-fA" + r"-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|([0-9a-fA-F]{1,4}:){1,5}(:[0-9a-fA-F]{1,4}){" + r"1,2}|([0-9a-fA-F]{1,4}:){1,4}(:[0-9a-fA-F]{1,4}){1,3}|([0-9a-fA-F]{1,4}:){1,3" + r"}(:[0-9a-fA-F]{1,4}){1,4}|([0-9a-fA-F]{1,4}:){1,2}(:[0-9a-fA-F]{1,4}){1,5}|[0" + r"-9a-fA-F]{1,4}:((:[0-9a-fA-F]{1,4}){1,6})|:((:[0-9a-fA-F]{1,4}){1,7}|:)|fe80:" + r"(:[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(ffff(:0{1,4}){0,1}:){0,1}((25[0-5" + r"]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0" + r"-9])|([0-9a-fA-F]{1,4}:){1,4}:((25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3," + r"3}(25[0-5]|(2[0-4]|1{0,1}[0-9]){0,1}[0-9]))" + ) + + +class Port: + regex = ( + r"((6553[0-5])|(655[0-2][0-9])|(65[0-4][0-9]{2})|(6[0-4][0-9]{3})|" + r"([1-5][0-9]{4})|([0-5]{0,5})|([0-9]{1,4}))" + ) + + +class MacAddress: + regex = r"[a-fA-F0-9]{2}(:[a-fA-F0-9]{2}){5}" + class SocialSecurityNumber(Str): regex = r"(?!0{3})(?!6{3})[0-8]\d{2}-(?!0{2})\d{2}-(?!0{4})\d{4}" + + +class CreditCard: + regex = ( + r"(^4[0-9]{12}(?:[0-9]{3})?$)|(^(?:5[1-5][0-9]{2}|222[1-9]|22[3-9][0-9]|2[3-6][" + r"0-9]{2}|27[01][0-9]|2720)[0-9]{12}$)|(3[47][0-9]{13})|(^3(?:0[0-5]|[68][0-9])" + r"[0-9]{11}$)|(^6(?:011|5[0-9]{2})[0-9]{12}$)|(^(?:2131|1800|35\d{3})\d{11}$)" + ) + + +class LatLon: + regex = r"((\-?|\+?)?\d+(\.\d+)?),\s*((\-?|\+?)?\d+(\.\d+)?)" + + +class SemanticVersion: + regex = ( + r"(0|[1-9]\d*)\.(0|[1-9]\d*)\.(0|[1-9]\d*)" + r"(?:-((?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" + r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?" + r"(?:\+([0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?" + ) + + +class JiraIssueTicket: + regex = r"[A-Z]{2,}-\d+" + + +class Hashtag: + regex = r"#[^ !@#$%^&*(),.?\":{}|<>]*" diff --git a/simplematch/py.typed b/simplematch/py.typed new file mode 100644 index 0000000..e69de29 diff --git a/simplematch/simplematch.py b/simplematch/simplematch.py index f6834b5..b7afb3c 100644 --- a/simplematch/simplematch.py +++ b/simplematch/simplematch.py @@ -2,52 +2,152 @@ simplematch """ import re +from typing import NamedTuple, Optional +from collections import defaultdict + +from . import converters as cv + + +class Block(NamedTuple): + name: Optional[str] + converter: Optional[str] + args: Optional[str] + + +def block_parser_regex(block_start_string: str = "<", block_end_string: str = ">"): + """ + Assembles a regular expression which matches wildcards (`*`) and blocks + in the form of + + `` + + Block delimiters (`<` and `>`) can be changed via the `block_start_string` and + `block_end_string` arguments. + + Matches have three captures: (`name`, `converter`, `args`). + """ + # https://regex101.com/r/xS2B04/3 + safe_chars = r"[^:\[\]%s%s]" % (block_start_string, block_end_string) + regex = re.compile( + r""" + (? str: + """ + This does two things: + 1. replaces a sm-syntax block with the regular expression given by the converter + 2. Adds the converter in the temporary list of converters + """ + # strip whitespace from within the block + name, _converter, _args = ( + x.strip() if x is not None else None for x in match.groups() + ) + # handle wildcard (*) + if name is _converter is _args is None: + return r".*" + converter = self.converters.get(_converter, cv.Str)() + self._tmp_converters[name or self.unnamed_key].append(converter) + return converter.regex + def parse_pattern(self, pattern: str): + self._tmp_converters.clear() + result = self.block_parser_regex.sub(self._replacer, pattern) + return result, dict(self._tmp_converters) -DEFAULT_ENV = Environment(block_start_string="<", block_end_string=">") +DEFAULT_ENV = Environment( + block_start_string="<", + block_end_string=">", + unnamed_key="unnamed", +) -txt = """ - \{test} - {test:test[123]} - °C - < year : int[max=4]>-- - <:float>*<:float><:float[ len = 2, case_sensitive]> - <:float>\**\ - - """ -for x in DEFAULT_ENV.parse(txt): - print(x) +class Matcher: + def __init__( + self, + pattern: str = "*", + case_sensitive: bool = True, + environment=DEFAULT_ENV, + ): + self.pattern = pattern + self.case_sensitive = case_sensitive + self.environment = environment + self.regex, self.converters = self.environment.parse_pattern(pattern) + print("Regex: ", self.regex) + print("Conve: ", self.converters) + + +Matcher("*Test") +Matcher(" °C wheather ") +Matcher("<:url><:url>") + +# txt = """ +# \{test} +# {test:test[123]} +# °C +# < year : int[max=4]>-- +# <:float>*<:float><:float[ len = 2, case_sensitive]> +# <:float>\**\ +# +# """ + +# for x in DEFAULT_ENV.parse(txt): +# print(x) diff --git a/test_simplematch.py b/test_simplematch.py index b50e9f5..4e733ca 100644 --- a/test_simplematch.py +++ b/test_simplematch.py @@ -78,7 +78,7 @@ def test_simple_matching(): # should return None object if no match assert sm.match("{folder}/{filename}?{params}", "hello.js?p=1") is None - # should match strings with . (dot) and ? (question mart) sights + # should match strings with . (dot) and ? (question mark) signs assert sm.match("{folder}/{filename}?{params}", "home/hello.js?p=1") == dict( folder="home", filename="hello.js", params="p=1" ) @@ -240,6 +240,7 @@ def test_type_ccard(inp, result): ("https://xkcd.com/2293/", True), ("https://this-shouldn't.match@example.com", False), ("http://www.example.com/", True), + ("http:/ww.example.com/", False), ), ) def test_type_url(inp, is_url): From b05da52ca28f45b3aa6d48e1c218b740f1390d4c Mon Sep 17 00:00:00 2001 From: Thomas Feldmann Date: Thu, 17 Nov 2022 14:50:09 +0100 Subject: [PATCH 3/3] add QuantifierMixin --- simplematch/converters.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/simplematch/converters.py b/simplematch/converters.py index c32888c..7c85e29 100644 --- a/simplematch/converters.py +++ b/simplematch/converters.py @@ -2,7 +2,12 @@ from ipaddress import IPv4Address -class Str: +class QuantifierMixin: + def __init__(self, args): + pass + + +class Str(QuantifierMixin): regex = r".*" @staticmethod @@ -10,7 +15,7 @@ def to_python(value: str) -> str: return value -class Int: +class Int(QuantifierMixin): regex = r"[+-]?[0-9]" @staticmethod