From ce303ea436752cc1912bc0bdc1da3e308fb9856b Mon Sep 17 00:00:00 2001 From: RastislavKish Date: Fri, 20 Mar 2026 06:36:06 +0100 Subject: [PATCH] Fix regular expressions used for cleaning the recognized text The \s and \S meta characters used by the regular expressions for cleaning whitespace out of the recognized text are apparently no longer recognized by the current versions of Python like 3.13. From brief research it's not obvious to me what exactly these characters were matching; some sources say \s matched [ \t\r\n] and \S [^ \t\r\n]. So I used [ \t\n] as a replacement for \s and [ \t] in the single instance where it was supposed to match anything except non-white-space characters and line endings, i.e. whitespace other than line endings. I have removed the \r characters altogether from the recognized text before applying the regular expressions to simplify handling the newlines. Otherwise the changes should exactly follow the previous regular expressions. --- src/ocrdesktop/main.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/ocrdesktop/main.py b/src/ocrdesktop/main.py index c76076c..9589a7a 100755 --- a/src/ocrdesktop/main.py +++ b/src/ocrdesktop/main.py @@ -389,17 +389,18 @@ def appendToOCRWords(self, OCRWords): except KeyError: self._OCRWords[k] = v def _cleanOCRText(self): - regexSpace = re.compile('[^\S\r\n]{2,}') #remove double spaces + self._OCRText=self._OCRText.replace("\r", "") # Clear out \r characters from line endings if necessary + regexSpace = re.compile('[ \t]{2,}') #remove double spaces self._OCRText = regexSpace.sub(' ',self._OCRText) - regexSpace = re.compile('\n\s*\n') #remove empty lines + regexSpace = re.compile('\n[ \t\n]*\n') #remove empty lines self._OCRText = regexSpace.sub('\n',self._OCRText) - regexSpace = re.compile('\s*\n') #remove ending spaces + regexSpace = re.compile('[ \t\n]*\n') #remove ending spaces self._OCRText = regexSpace.sub('\n',self._OCRText) - regexSpace = re.compile('^\s') #remove trailing space in first line + 
regexSpace = re.compile('^[ \t\n]') #remove trailing space in first line self._OCRText = regexSpace.sub( '\n', self._OCRText) regexSpace = re.compile('$\n') #remove ending newline self._OCRText = regexSpace.sub( '', self._OCRText) - regexSpace = re.compile('\n\s') #remove trailing spaces + regexSpace = re.compile('\n[ \t\n]') #remove trailing spaces self._OCRText = regexSpace.sub( '\n', self._OCRText) self._OCRText = self._OCRText[:-1]