Skip to content

Commit 3e2c848

Browse files
committed
feat: add regex example
1 parent f5d1588 commit 3e2c848

1 file changed

Lines changed: 206 additions & 0 deletions

File tree

Lines changed: 206 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 15,
6+
"id": "66c20f5d",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"name": "stdout",
11+
"output_type": "stream",
12+
"text": [
13+
"['the', 'house,', 'and', 'the', 'super-cat']\n",
14+
"['the', 'house', 'and', 'the', 'super-cat']\n",
15+
"['the', 'house', 'and', 'the', 'super', 'cat']\n",
16+
"['the', 'house', 'and', 'the', 'super', 'cat']\n"
17+
]
18+
}
19+
],
20+
"source": [
21+
"import re\n",
22+
"\n",
23+
"\n",
24+
"def simple_tokenize(sentence: str):\n",
25+
"    return sentence.lower().split()\n",
26+
"\n",
27+
"def tokenize(sentence: str, tokenize_regex_pattern_:str ):\n",
28+
"    return re.findall(tokenize_regex_pattern_, sentence.lower())\n",
29+
"\n",
30+
" \n",
31+
"token_pattern: str = r\"[\\w\\/]+\"\n",
32+
"# Optional: tokenize_regex_pattern_ = re.compile(token_pattern) would precompile the pattern for repeated use.\n",
33+
"\n",
34+
"\n",
35+
"str1 = \"the house, and the super-cat\"\n",
36+
"str2 = \"the house and the super-cat\"\n",
37+
"\n",
38+
"print(simple_tokenize(str1))\n",
39+
"print(simple_tokenize(str2))\n",
40+
"\n",
41+
"\n",
42+
"print(tokenize(str1, token_pattern))\n",
43+
"print(tokenize(str2, token_pattern))\n"
44+
]
45+
},
46+
{
47+
"cell_type": "code",
48+
"execution_count": 24,
49+
"id": "66d06397",
50+
"metadata": {},
51+
"outputs": [
52+
{
53+
"name": "stdout",
54+
"output_type": "stream",
55+
"text": [
56+
"['the', 'house', 'and', 'the', 'super-cat']\n",
57+
"['the', 'house', 'and', 'the', 'super-cat']\n"
58+
]
59+
}
60+
],
61+
"source": [
62+
"\n",
63+
"token_pattern: str = r\"[\\w?-]+\"\n",
64+
"\n",
65+
"str1 = \"the house, and the super-cat!\"\n",
66+
"str2 = \"the house and the super-cat!\"\n",
67+
"\n",
68+
"print(tokenize(str1, token_pattern))\n",
69+
"print(tokenize(str2, token_pattern))"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": 25,
75+
"id": "20728229",
76+
"metadata": {},
77+
"outputs": [
78+
{
79+
"name": "stdout",
80+
"output_type": "stream",
81+
"text": [
82+
"['the', 'house', 'and', 'the', 'super', 'cat']\n",
83+
"['the', 'house', 'and', 'the', 'super', 'cat']\n"
84+
]
85+
}
86+
],
87+
"source": [
88+
"\n",
89+
"token_pattern: str = r\"[\\w]+\"\n",
90+
"\n",
91+
"str1 = \"the house, and the super-cat!\"\n",
92+
"str2 = \"the house and the super-cat!\"\n",
93+
"\n",
94+
"print(tokenize(str1, token_pattern))\n",
95+
"print(tokenize(str2, token_pattern))"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": 21,
101+
"id": "25364219",
102+
"metadata": {},
103+
"outputs": [
104+
{
105+
"name": "stdout",
106+
"output_type": "stream",
107+
"text": [
108+
"the house  and the super cat \n",
109+
"the house and the super cat \n"
110+
]
111+
}
112+
],
113+
"source": [
114+
"\n",
115+
"def clean_non_alphabetic(str_raw):\n",
116+
"    pattern = r\"[^a-zA-Z0-9\\s]+\"\n",
117+
"    str_raw_clean = re.sub(pattern, \" \", str_raw)\n",
118+
"    return str_raw_clean\n",
119+
"\n",
120+
"\n",
121+
"\n",
122+
"str1 = \"the house, and the super-cat!\"\n",
123+
"str2 = \"the house and the super-cat!\"\n",
124+
"\n",
125+
"\n",
126+
"print(clean_non_alphabetic(str1))\n",
127+
"print(clean_non_alphabetic(str2))"
128+
]
129+
},
130+
{
131+
"cell_type": "code",
132+
"execution_count": 18,
133+
"id": "53c19a7f",
134+
"metadata": {},
135+
"outputs": [
136+
{
137+
"data": {
138+
"text/plain": [
139+
"'the house and the super cat'"
140+
]
141+
},
142+
"execution_count": 18,
143+
"metadata": {},
144+
"output_type": "execute_result"
145+
}
146+
],
147+
"source": []
148+
},
149+
{
150+
"cell_type": "code",
151+
"execution_count": 19,
152+
"id": "423f1d8b",
153+
"metadata": {},
154+
"outputs": [
155+
{
156+
"data": {
157+
"text/plain": [
158+
"'the house and the super cat'"
159+
]
160+
},
161+
"execution_count": 19,
162+
"metadata": {},
163+
"output_type": "execute_result"
164+
}
165+
],
166+
"source": []
167+
},
168+
{
169+
"cell_type": "code",
170+
"execution_count": null,
171+
"id": "5615f6cd",
172+
"metadata": {},
173+
"outputs": [],
174+
"source": []
175+
},
176+
{
177+
"cell_type": "code",
178+
"execution_count": null,
179+
"id": "a37a8c0a",
180+
"metadata": {},
181+
"outputs": [],
182+
"source": []
183+
}
184+
],
185+
"metadata": {
186+
"kernelspec": {
187+
"display_name": "Python 3 (ipykernel)",
188+
"language": "python",
189+
"name": "python3"
190+
},
191+
"language_info": {
192+
"codemirror_mode": {
193+
"name": "ipython",
194+
"version": 3
195+
},
196+
"file_extension": ".py",
197+
"mimetype": "text/x-python",
198+
"name": "python",
199+
"nbconvert_exporter": "python",
200+
"pygments_lexer": "ipython3",
201+
"version": "3.9.12"
202+
}
203+
},
204+
"nbformat": 4,
205+
"nbformat_minor": 5
206+
}

0 commit comments

Comments
 (0)