diff --git a/examples/grep_speed.py b/examples/grep_speed.py index 16b4d7bc..8325d2bc 100644 --- a/examples/grep_speed.py +++ b/examples/grep_speed.py @@ -6,9 +6,14 @@ _, filename, limit = sys.argv -with open(filename) as fh: - for line in fh: - for _ in range(int(limit)): - if re.search(r'y', line): - print(line) +def grep(regex, filename): + with open(filename) as fh: + for line in fh: + if re.search(regex, line): + print(line, end='') +i = int(limit) + +while i: + i-=1 + grep('y', filename) diff --git a/examples/grep_speed.sh b/examples/grep_speed.sh index 9d946dde..2ced1a2a 100644 --- a/examples/grep_speed.sh +++ b/examples/grep_speed.sh @@ -1,7 +1,13 @@ +#!/bin/bash + +if (($# != 2)); then + echo "$0" 'FILENAME LIMIT' >&2 + exit 1 +fi + filename=$1 limit=$2 -for ((i=1;i<=$limit;i++)); -do - grep y $filename +for ((i=limit; i; --i)); do + grep y "$filename" done diff --git a/examples/grep_speed_open_once.py b/examples/grep_speed_open_once.py new file mode 100644 index 00000000..1c089f3d --- /dev/null +++ b/examples/grep_speed_open_once.py @@ -0,0 +1,20 @@ +import sys +import re + +if len(sys.argv) != 3: + exit(f"{sys.argv[0]} FILENAME LIMIT") + +_, filename, limit = sys.argv + +def grep(regex, fh): + for line in fh: + if re.search(regex, line): + print(line, end='') + +i = int(limit) + +with open(filename) as fh: + while i: + i-=1 + grep('y', fh) + fh.seek(0) diff --git a/examples/grep_speed_optimized.py b/examples/grep_speed_optimized.py new file mode 100644 index 00000000..041b7924 --- /dev/null +++ b/examples/grep_speed_optimized.py @@ -0,0 +1,21 @@ +import sys +import re + +if len(sys.argv) != 3: + exit(f"{sys.argv[0]} FILENAME LIMIT") + +_, filename, limit = sys.argv + +def grep(regex, fh): + for line in fh: + if regex.search(line): + print(line, end='') + +i = int(limit) + +y = re.compile('y') +with open(filename) as fh: + while i: + i-=1 + grep(y, fh) + fh.seek(0) diff --git a/examples/grep_speed_oxo.py b/examples/grep_speed_oxo.py index 
5c3644cc..9b4d3d4a 100644 --- a/examples/grep_speed_oxo.py +++ b/examples/grep_speed_oxo.py @@ -6,9 +6,16 @@ _, filename, limit = sys.argv -with open(filename) as fh: +def grep(regex, fh): for line in fh: - for _ in range(int(limit)): - if re.search(r'(.)y\1', line): - print(line) + if regex.search(line): + print(line, end='') + +i = int(limit) +y = re.compile(r'(.)y\1') +with open(filename) as fh: + while i: + i-=1 + grep(y, fh) + fh.seek(0) diff --git a/examples/grep_speed_oxo.sh b/examples/grep_speed_oxo.sh index e92fc2d4..95b0207a 100644 --- a/examples/grep_speed_oxo.sh +++ b/examples/grep_speed_oxo.sh @@ -1,7 +1,13 @@ +#!/bin/bash + +if (($# != 2)); then + echo "$0" 'FILENAME LIMIT' >&2 + exit 1 +fi + filename=$1 limit=$2 -for ((i=1;i<=$limit;i++)); -do - grep '\(.\)y\1' $filename +for ((i=limit; i; --i)); do + grep '\(.\)y\1' "$filename" done diff --git a/examples/grep_speed_oxo_unoptimized.py b/examples/grep_speed_oxo_unoptimized.py new file mode 100644 index 00000000..1d75fe7c --- /dev/null +++ b/examples/grep_speed_oxo_unoptimized.py @@ -0,0 +1,19 @@ +import sys +import re + +if len(sys.argv) != 3: + exit(f"{sys.argv[0]} FILENAME LIMIT") + +_, filename, limit = sys.argv + +def grep(regex, filename): + with open(filename) as fh: + for line in fh: + if re.search(regex, line): + print(line, end='') + +i = int(limit) + +while i: + i-=1 + grep(r'(.)y\1', filename) diff --git a/sites/en/pages/compare-the-speed-of-grep-with-python-regex.txt b/sites/en/pages/compare-the-speed-of-grep-with-python-regex.txt index 9c62679e..fe210fea 100644 --- a/sites/en/pages/compare-the-speed-of-grep-with-python-regex.txt +++ b/sites/en/pages/compare-the-speed-of-grep-with-python-regex.txt @@ -9,7 +9,7 @@ =abstract start -At one of my client we had a Bash script that grepped a huge log file 20 times in order to generate a report. +One of my clients had a Bash script that grepped a huge log file 20 times in order to generate a report. 
It created a lot of load on the server as grep was reading the entire file 20 times. As we were converting our Shell scripts to Python anyway I thought I could rewrite it in Python and go over the file @@ -31,21 +31,34 @@ We can run it like this, indicating the name of the file we would like to create the number of rows and the length of rows. -python create-big-file.py FILENAME NUMBER-OF-ROWS LENGTH-OF-ROWS +$ python create-big-file.py FILENAME NUMBER-OF-ROWS LENGTH-OF-ROWS + + +For example: + + +$ python create-big-file.py a.txt 100000 50 It will create a file full of the character "x", with a single "y" somewhere. + +$ wc a.txt + 1000000 1000000 51000000 a.txt +$ grep y a.txt +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxyx + + I think this is going to be good enough for our simple example.

Using grep

In the original shell script we had some 20 different calls to grep, -but to make it simpler I made this shell script with that runs the same regex multiple times. +but to make it simpler I made this shell script, which runs the same regex multiple times. -You can pass the name of the data file and the number of time you'd like to run grep. +You can pass the name of the data file and the number of times you'd like to run grep.

Grep with Python regexes

@@ -61,46 +74,59 @@ and thous would be probably faster, but in our cases we really had more complex

Comparing the speed

+Here are the results of running the grep test: + -python create-big-file.py a.txt 100000 50 +$ time bash examples/grep_speed.sh a.txt 20 >/dev/null + +real 0m0.355s +user 0m0.238s +sys 0m0.097s -Verify the file: -$ wc a.txt - 1000000 1000000 51000000 a.txt - +$ time python examples/grep_speed.py a.txt 20 >/dev/null - -# grep y a.txt -xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxyx +real 0m9.897s +user 0m9.772s +sys 0m0.120s +So, grep is nearly 30 times faster than Python; what if we optimize the Python code by only opening the file once? + + + +Making that change did almost nothing to improve the speed. + -$ time bash examples/grep_speed.sh a.txt 20 +$ time python examples/grep_speed_open_once.py a.txt 20 >/dev/null -real 0m0.227s -user 0m0.055s -sys 0m0.172s +real 0m9.712s +user 0m9.625s +sys 0m0.082s +What if we optimize the regular expression by compiling it only once? + + + +That makes a significant improvement! -$ time python examples/grep_speed.py a.txt 20 +$ time python examples/grep_speed_optimized.py a.txt 20 >/dev/null -real 0m9.509s -user 0m9.477s -sys 0m0.032s +real 0m2.198s +user 0m2.121s +sys 0m0.075s - -grep is about 50 times faster than Python even though grep had to read the file 20 time while Python only read it once. +By pre-compiling the regular expression, the Python code is now about 4.5x faster than the unoptimized Python code; however, grep is still about 6 times faster than Python, even though grep must start afresh on each iteration.

More complex grep

-In the previous case we used a very simple regex, now let's change it to use a slightly more complex expression +In the previous case we used a very simple regex; now, let's change it to use a slightly more complex expression in which we are not only looking for a single character, but we also want to make sure it is between two identical characters. @@ -113,30 +139,43 @@ identical characters. You can try it yourself: -grep '\(.\)y\1' a.txt +$ grep '\(.\)y\1' a.txt +xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxyx

Comparing the speed of the more complex examples

-$ time bash examples/grep_speed_oxo.sh a.txt 20 +$ time bash examples/grep_speed_oxo.sh a.txt 20 >/dev/null -real 0m0.196s -user 0m0.035s -sys 0m0.161s +real 0m0.413s +user 0m0.297s +sys 0m0.097s -$ time python examples/grep_speed_oxo.py a.txt 20 +$ time python examples/grep_speed_oxo.py a.txt 20 >/dev/null -real 0m25.067s -user 0m24.972s -sys 0m0.016s +real 0m12.724s +user 0m12.589s +sys 0m0.128s -The speed of grep did not change, but Python became even slower. This time grep is more than a 100 times faster than Python. +The speed of grep did not change appreciably, but the Python code became much slower; this time, grep is more than 30 times faster than Python, even though the Python code uses some explicit optimizations. How does the unoptimized code fare? + + + +Using grep is about 57 times faster than using the unoptimized Python code. + + +$ time python examples/grep_speed_oxo_unoptimized.py a.txt 20 >/dev/null + +real 0m23.448s +user 0m23.319s +sys 0m0.114s +

Version information

@@ -147,13 +186,13 @@ Python 3.8.2 $ grep -V -grep (GNU grep) 3.4 +grep (GNU grep) 3.3

Other cases

-The results are consistent with what I saw during my work, but I wonder what would be the results if the file was larger than the available memory in my computer. +The results are consistent with what I saw during my work, but I wonder what the results would be if the file were larger than the available memory in my computer.

Conclusion

@@ -161,7 +200,7 @@ The results are consistent with what I saw during my work, but I wonder what wou Or I made a mistake somewhere that impacts the results. -Oh and one more thing, I also create a Perl version of the code and +Oh and one more thing, I also created a Perl version of the code and Perl is much faster than Python even though it is also slower than the grep code.