From 4db11a30a2967502ab2b2b69260f5ea3cafa14f5 Mon Sep 17 00:00:00 2001
From: bejean <dominique.bejean@eolya.fr>
Date: Thu, 5 Mar 2026 09:52:29 +0100
Subject: [PATCH 1/2] fix: regex global flag

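Since Python 3.11, the re module raises an error when a global inline
flag such as (?i) is not placed at the very beginning of the regular
expression (this had been deprecated since Python 3.6). Drop the
mid-pattern (?i) from ExtLinkBracketedRegex and EXT_IMAGE_REGEX and pass
re.I as a compile flag instead.

Also write the regular expression literals as raw strings to avoid
invalid escape sequence warnings, and default Extractor.discardTemplates
and Extractor.ignoreTemplates to empty sets instead of None.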
---
 wikiextractor/extract/extract.py | 34 ++++++++++++++++----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/wikiextractor/extract/extract.py b/wikiextractor/extract/extract.py
index 3cfdeac..7926bc7 100755
--- a/wikiextractor/extract/extract.py
+++ b/wikiextractor/extract/extract.py
@@ -21,7 +21,7 @@
 # ----------------------------------------------------------------------
 
 # match tail after wikilink
-tailRE = re.compile('\w+')
+tailRE = re.compile(r'\w+')
 syntaxhighlight = re.compile('&lt;syntaxhighlight .*?&gt;(.*?)&lt;/syntaxhighlight&gt;', re.DOTALL)
 
 ## PARAMS ####################################################################
@@ -239,8 +239,8 @@ def clean(extractor, text, expand_templates=False, language = None, html_safe=Tr
     text = text.replace('\t', ' ')
     text = spaces.sub(' ', text)
     text = dots.sub('...', text)
-    text = re.sub(u' (,:\.\)\]»)', r'\1', text)
-    text = re.sub(u'(\[\(«) ', r'\1', text)
+    text = re.sub(r' (,:\.\)\]»)', r'\1', text)
+    text = re.sub(r'(\[\(«) ', r'\1', text)
     text = re.sub(r'\n\W+?\n', '\n', text, flags=re.U)  # lines with only punctuations
     text = text.replace(',,', ',').replace(',.', '.')
     if html_safe:
@@ -382,7 +382,7 @@ def compact(text, mark_headers=False):
                         for (i, v) in items:
                             page.append(v) #header title
                     headers.clear()
-                list_item = re.sub('[;#\*]',' ', line) 
+                list_item = re.sub(r'[;#\*]',' ', line)
                 #Fixme? sometimes list before indent: "#:"
                 list_item= re.sub("(^ *)(.+)",r"\1- \2",list_item)
                 page.append(list_item)
@@ -516,12 +516,12 @@ def dropSpans(spans, text):
 # as well as U+3000 is IDEOGRAPHIC SPACE for bug 19052
 EXT_LINK_URL_CLASS = r'[^][<>"\x00-\x20\x7F\s]'
 ExtLinkBracketedRegex = re.compile(
-    '\[(((?i)' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
-    re.S | re.U)
+    r'\[((' + '|'.join(wgUrlProtocols) + ')' + EXT_LINK_URL_CLASS + r'+)\s*([^\]\x00-\x08\x0a-\x1F]*?)\]',
+    re.I | re.S | re.U)
 EXT_IMAGE_REGEX = re.compile(
     r"""^(http://|https://)([^][<>"\x00-\x20\x7F\s]+)
-    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.((?i)gif|png|jpg|jpeg)$""",
-    re.X | re.S | re.U)
+    /([A-Za-z0-9_.,~%\-+&;#*?!=()@\x80-\xFF]+)\.(gif|png|jpg|jpeg)$""",
+    re.I | re.X | re.S | re.U)
 
 
 def replaceExternalLinks(text):
@@ -870,7 +870,7 @@ def fixup(m):
         except:
             return text  # leave as is
 
-    return re.sub("&#?(\w+);", fixup, text)
+    return re.sub(r"&#?(\w+);", fixup, text)
 
 
 # Match HTML comments
@@ -1067,10 +1067,10 @@ class Extractor():
     discardSections = None
 
     ## Banned template names path
-    discardTemplates = None
-    
+    discardTemplates = set()
+
     ## Ignores template names path
-    ignoreTemplates = None
+    ignoreTemplates = set()
 
 
     ## Output format 
@@ -1441,7 +1441,7 @@ def expandTemplate(self, body, language = None):
         #  and the have just the content written in other language
         #    FORMAT  1: e.g. {{lang|fr|Je suis ....}} Return: "космонавт"
         #    FORMART 2: e.g. Template {{lang-ru|космонавт}} Return: "russian:космонавт"
-        elif(re.match('lang\-+',title,re.IGNORECASE) ):
+        elif(re.match(r'lang\-+',title,re.IGNORECASE) ):
             if(language):
                 try:
                     isoCode = parts[0].split('-')[1]
@@ -1792,8 +1792,8 @@ def findMatchingBraces(text, ldelim=0):
         reOpen = re.compile('[{]{%d,}' % ldelim)  # at least ldelim
         reNext = re.compile('[{]{2,}|}{2,}')  # at least 2 open or close bracces
     else:
-        reOpen = re.compile('{{2,}|\[{2,}')
-        reNext = re.compile('{{2,}|}{2,}|\[{2,}|]{2,}')  # at least 2
+        reOpen = re.compile(r'{{2,}|\[{2,}')
+        reNext = re.compile(r'{{2,}|}{2,}|\[{2,}|]{2,}')  # at least 2
 
     cur = 0
     while True:
@@ -2077,7 +2077,7 @@ def sharp_ifeq(lvalue, rvalue, valueIfTrue, valueIfFalse=None, *args):
 
 
 def sharp_iferror(test, then='', Else=None, *args):
-    if re.match('<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test):
+    if re.match(r'<(?:strong|span|p|div)\s(?:[^\s>]*\s+)*?class="(?:[^"\s>]*\s+)*?error(?:\s[^">]*)?"', test):
         return then
     elif Else is None:
         return test.strip()
@@ -2250,7 +2250,7 @@ def define_template(title, page):
 
     # check for redirects
     #m = re.match('#REDIRECT.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
-    m = re.match('#REDIRE.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
+    m = re.match(r'#REDIRE.*?\[\[([^\]]*)]]', page[0], re.IGNORECASE)
 
 
     if m:

From 80f30cd075fc2d8501f91a244ded55016d103be1 Mon Sep 17 00:00:00 2001
From: bejean <dominique.bejean@eolya.fr>
Date: Thu, 5 Mar 2026 09:53:13 +0100
Subject: [PATCH 2/2] fix: SyntaxWarning: invalid escape sequence '\S'

---
 wikiextractor/WikiExtractor.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wikiextractor/WikiExtractor.py b/wikiextractor/WikiExtractor.py
index f45f217..09dbd9e 100755
--- a/wikiextractor/WikiExtractor.py
+++ b/wikiextractor/WikiExtractor.py
@@ -681,12 +681,12 @@ def main(*args, **kwargs):
     groupP.add_argument("--discard_templates", action="store_true",
                         help="If specified, it will discard \
                               some wikipedia docs if containg some templates titles (e.g. Disambiguation, Desambiguación). \
-                              \Since most template names are usually tranlated.  \
+                              Since most template names are usually tranlated.  \
                                 See an example under config/discard_templates.txt ")
     groupP.add_argument("--ignore_templates", action="store_true",
                         help="If specified, it will not expand \
                               some templates (e.g. Millorar format). \
-                              \Since most template names are usually tranlated.  \
+                              Since most template names are usually tranlated.  \
                                 See an example under config/ignore_templates.txt ")
     groupP.add_argument("--html_safe", default=True,
                         help="use to produce HTML safe output within <doc>...</doc>")