feat: add regex example

davidbp · davidbp · commit 3e2c8483b96e · 2025-03-13T11:37:08.000+01:00
diff --git a/python_basics/re/regex_for_tokenizing.ipynb b/python_basics/re/regex_for_tokenizing.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "66c20f5d",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['the', 'house,', 'and', 'the', 'super-cat']\n",
+      "['the', 'house', 'and', 'the', 'super-cat']\n",
+      "['the', 'house', 'and', 'the', 'super', 'cat']\n",
+      "['the', 'house', 'and', 'the', 'super', 'cat']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "\n",
+    "def simple_tokenize(sentence: str) -> list[str]:\n",
+    "    return sentence.lower().split()  # whitespace-only split: punctuation stays attached to words\n",
+    "\n",
+    "def tokenize(sentence: str, tokenize_regex_pattern_: str) -> list:\n",
+    "    \"\"\"Lowercase `sentence` and return every non-overlapping match of the regex, as `re.findall` does.\"\"\"\n",
+    "    return re.findall(tokenize_regex_pattern_, sentence.lower())\n",
+    "\n",
+    "# A token is a run of word characters and '/'; ',' and '-' act as separators.\n",
+    "token_pattern: str = r\"[\\w\\/]+\"\n",
+    "\n",
+    "str1 = \"the house, and the super-cat\"\n",
+    "str2 = \"the house and the super-cat\"\n",
+    "\n",
+    "# Whitespace split first, for comparison: the comma stays glued to 'house,'.\n",
+    "for text in (str1, str2):\n",
+    "    print(simple_tokenize(text))\n",
+    "\n",
+    "# Regex tokenizer: punctuation is dropped and 'super-cat' becomes two tokens.\n",
+    "for text in (str1, str2):\n",
+    "    print(tokenize(text, token_pattern))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "66d06397",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['the', 'house', 'and', 'the', 'super-cat']\n",
+      "['the', 'house', 'and', 'the', 'super-cat']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# '-' placed last in the class is a literal hyphen, so hyphenated words stay whole.\n",
+    "token_pattern: str = r\"[\\w-]+\"\n",
+    "\n",
+    "str1 = \"the house, and the super-cat!\"\n",
+    "str2 = \"the house and the super-cat!\"\n",
+    "\n",
+    "print(tokenize(str1, token_pattern))\n",
+    "print(tokenize(str2, token_pattern))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "20728229",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['the', 'house', 'and', 'the', 'super', 'cat']\n",
+      "['the', 'house', 'and', 'the', 'super', 'cat']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Word characters only: '-' is a separator here, so 'super-cat' yields two tokens.\n",
+    "token_pattern: str = r\"[\\w]+\"\n",
+    "\n",
+    "str1 = \"the house, and the super-cat!\"\n",
+    "str2 = \"the house and the super-cat!\"\n",
+    "\n",
+    "for text in (str1, str2):\n",
+    "    print(tokenize(text, token_pattern))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "25364219",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "the house  and the super cat \n",
+      "the house and the super cat \n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "def clean_non_alphabetic(str_raw):\n",
+    "    \"\"\"Replace each run of characters that are not ASCII letters, digits or whitespace with one space.\"\"\"\n",
+    "    unwanted_run = re.compile(r\"[^a-zA-Z0-9\\s]+\")\n",
+    "    return unwanted_run.sub(\" \", str_raw)\n",
+    "\n",
+    "\n",
+    "\n",
+    "str1 = \"the house, and the super-cat!\"\n",
+    "str2 = \"the house and the super-cat!\"\n",
+    "\n",
+    "# Punctuation becomes a space, so ', ' leaves a double space and '!' a trailing one.\n",
+    "for raw_text in (str1, str2):\n",
+    "    print(clean_non_alphabetic(raw_text))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "53c19a7f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'the house  and the super cat'"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "423f1d8b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'the house and the super cat'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5615f6cd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a37a8c0a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}