|
| 1 | +import sys |
| 2 | +import time |
| 3 | + |
| 4 | +import pandas as pd |
| 5 | +import polars as pl |
| 6 | +from data_generation import data_generation |
| 7 | + |
# Data Generation
#
# The size passed to data_generation() comes from the first command-line
# argument (presumably the number of rows to generate; confirm against
# data_generation). Exit with a short usage hint rather than a raw
# IndexError/ValueError traceback when it is missing or not an integer.

if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <size>")

try:
    data_size = int(sys.argv[1])
except ValueError:
    sys.exit(f"size must be an integer, got {sys.argv[1]!r}")

test_data = data_generation(data_size)
| 11 | + |
# Polars DataFrame Test
#
# Measures eager DataFrame construction and a group-by/sum query separately.
# time.perf_counter() is used rather than time.time() because it is monotonic
# and uses the highest-resolution clock available, so the intervals cannot be
# skewed by system clock adjustments.

overall_time_start = time.perf_counter()

polars_dataframe = pl.DataFrame(test_data)

processing_time_start = time.perf_counter()

# The aggregation result is deliberately discarded; only its runtime matters.
(
    polars_dataframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
)

end_time = time.perf_counter()

# Drop the reference once timing is done (presumably to release the frame's
# memory before the code that follows runs).
del polars_dataframe

print(
    f"Polars DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars DataFrame query runtime: {end_time - processing_time_start}")
print(f"Polars DataFrame overall time: {end_time - overall_time_start}")
print()
| 36 | + |
# Polars LazyFrame Test
#
# Measures LazyFrame construction and a group-by/sum query separately. The
# query only executes when .collect() is called, so that call sits inside the
# timed region. time.perf_counter() is used rather than time.time() because
# it is monotonic and uses the highest-resolution clock available.

overall_time_start = time.perf_counter()

polars_lazyframe = pl.LazyFrame(test_data)

processing_time_start = time.perf_counter()

# The collected result is deliberately discarded; only its runtime matters.
(
    polars_lazyframe.group_by(["region", "product", "sales_person"]).agg(
        total_sales=pl.col("sales_income").sum()
    )
).collect()

end_time = time.perf_counter()

# Drop the reference once timing is done (presumably to release memory
# before the code that follows runs).
del polars_lazyframe

print(
    f"Polars LazyFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Polars LazyFrame query runtime: {end_time - processing_time_start}")
print(f"Polars LazyFrame overall time: {end_time - overall_time_start}")
print()
| 61 | + |
# Pandas DataFrame Test
#
# Measures DataFrame construction and a groupby/sum query separately.
# time.perf_counter() is used rather than time.time() because it is monotonic
# and uses the highest-resolution clock available, so the intervals cannot be
# skewed by system clock adjustments.

overall_time_start = time.perf_counter()

pandas_dataframe = pd.DataFrame(test_data)

processing_time_start = time.perf_counter()

# The aggregation result is deliberately discarded; only its runtime matters.
pandas_dataframe.groupby(["region", "product", "sales_person"])[
    "sales_income"
].sum()

end_time = time.perf_counter()

# Drop the reference once timing is done.
del pandas_dataframe

print(
    f"Pandas DataFrame creation: {processing_time_start - overall_time_start}"
)
print(f"Pandas DataFrame query runtime: {end_time - processing_time_start}")
print(f"Pandas DataFrame overall time: {end_time - overall_time_start}")
0 commit comments