-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnormalization.py
More file actions
35 lines (26 loc) · 990 Bytes
/
normalization.py
File metadata and controls
35 lines (26 loc) · 990 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
import unicodedata
# Hebrew combining marks only: cantillation accents (U+0591-U+05AF) and
# niqqud points (U+05B0-U+05BD, U+05BF, U+05C1, U+05C2, U+05C4, U+05C5,
# U+05C7).  The punctuation code points interleaved in that range -- maqaf
# (U+05BE), paseq (U+05C0), sof pasuq (U+05C3) and nun hafukha (U+05C6) --
# are deliberately excluded: they are not diacritics, and deleting the maqaf
# would glue hyphenated words together.
HEBREW_NIKKUD_RE = re.compile(
    r"[\u0591-\u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7]"
)
# One or more consecutive whitespace characters.
WHITESPACE_RE = re.compile(r"\s+")
def strip_nikkud(text):
    """Remove Hebrew niqqud/cantillation marks from *text*.

    ``None`` yields the empty string.  Any other value is coerced with
    ``str()`` and NFC-normalized, then every character matched by
    ``HEBREW_NIKKUD_RE`` is deleted from the normalized string.
    """
    # Guard first: str(None) would otherwise produce the literal "None".
    return (
        ""
        if text is None
        else HEBREW_NIKKUD_RE.sub("", unicodedata.normalize("NFC", str(text)))
    )
def has_nikkud(text):
    """Return True if the text contains Hebrew niqqud/cantillation marks.

    The input is coerced with ``str()`` and NFC-normalized before matching,
    exactly as ``strip_nikkud`` does, so both functions inspect the same
    string.  This matters for precomposed Hebrew presentation forms (for
    example U+FB2A, shin with shin dot): NFC expands them into a base letter
    followed by the combining point, which is only then visible to
    ``HEBREW_NIKKUD_RE``.

    Falsy input (``None``, ``""`` and the like) yields ``False``.
    """
    if not text:
        return False
    normalized = unicodedata.normalize("NFC", str(text))
    return HEBREW_NIKKUD_RE.search(normalized) is not None
def compute_search_key(text, language_code=""):
    """Build a normalized search key for indexed lookups.

    Processing order:

    1. ``None`` short-circuits to ``""``; anything else is coerced with
       ``str()`` and NFC-normalized.
    2. If the language is Hebrew, niqqud/cantillation marks are removed via
       ``strip_nikkud``.
    3. Runs of whitespace collapse to a single space and the result is
       trimmed.
    4. The key is lower-cased.

    Args:
        text: Value to build the key from.
        language_code: Language identifier, matched case-insensitively.
            ``"he"``, ``"heb"`` and ``"hebrew"`` select Hebrew handling, with
            or without a region/script suffix separated by ``-`` or ``_``
            (``"he-IL"``, ``"he_IL"``).  Falsy values mean "no language".

    Returns:
        The normalized key as a ``str``.
    """
    if text is None:
        return ""
    value = unicodedata.normalize("NFC", str(text))

    # Only the primary subtag decides the language, so locale-style codes
    # such as "he-IL" or "he_IL" are treated as Hebrew as well.
    code = (language_code or "").strip().lower()
    primary = code.replace("_", "-").partition("-")[0]
    if primary in {"he", "heb", "hebrew"}:
        value = strip_nikkud(value)

    # Trim *after* mark removal: a stray mark at either end would otherwise
    # shield adjacent whitespace from an earlier strip() and leave a leading
    # or trailing space in the key.
    value = WHITESPACE_RE.sub(" ", value).strip()
    return value.lower()