|
| 1 | +import functools |
| 2 | +import sys |
| 3 | +from timeit import Timer |
| 4 | + |
| 5 | +import pandas as pd |
| 6 | +import polars as pl |
| 7 | + |
| 8 | +from data_generation import data_generation |
| 9 | + |
| 10 | + |
| 11 | +def create_pandas_dataframe(test_data): |
| 12 | + return pd.DataFrame(test_data).convert_dtypes(dtype_backend="pyarrow") |
| 13 | + |
| 14 | + |
def create_polars_dataframe(test_data):
    """Construct and return a polars ``DataFrame`` from ``test_data``."""
    frame = pl.DataFrame(test_data)
    return frame
| 17 | + |
| 18 | + |
def create_polars_lazyframe(test_data):
    """Construct and return a polars ``LazyFrame`` from ``test_data``."""
    lazy_frame = pl.LazyFrame(test_data)
    return lazy_frame
| 21 | + |
| 22 | + |
def analyze_pandas_dataframe(pandas_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The aggregated result is computed and then discarded; nothing is
    returned. Callers in this script only time the call.
    """
    group_keys = ["region", "product", "sales_person"]
    income_by_group = pandas_df.groupby(group_keys)["sales_income"]
    # Result intentionally unused: only the cost of computing it matters.
    income_by_group.sum()
| 27 | + |
| 28 | + |
def analyze_polars_dataframe(polars_df):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The aggregate is exposed under the name ``total_sales``. The result
    is computed and then discarded; nothing is returned.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    # Result intentionally unused: only the cost of computing it matters.
    polars_df.group_by(group_keys).agg(total_sales=income_total)
| 33 | + |
| 34 | + |
def analyze_polars_lazyframe(polars_lf):
    """Sum ``sales_income`` per (region, product, sales_person) group.

    The grouped aggregation (named ``total_sales``) is built on the lazy
    frame and then executed with ``collect()``. The collected result is
    discarded; nothing is returned.
    """
    group_keys = ["region", "product", "sales_person"]
    income_total = pl.col("sales_income").sum()
    lazy_query = polars_lf.group_by(group_keys).agg(total_sales=income_total)
    # collect() runs the query; the result itself is intentionally unused.
    lazy_query.collect()
| 39 | + |
| 40 | + |
# Number of executions timeit performs for each benchmarked call.
REPEATS = 100


def _time_call(func, arg):
    """Return the total time timeit reports for REPEATS calls of func(arg)."""
    return Timer(functools.partial(func, arg)).timeit(REPEATS)


# Parse the requested row count exactly once. A missing or non-integer
# argument ends the script with a short message instead of a traceback.
if len(sys.argv) < 2:
    sys.exit(f"usage: {sys.argv[0]} <row_count>")
try:
    row_count = int(sys.argv[1])
except ValueError:
    sys.exit(f"row_count must be an integer, got {sys.argv[1]!r}")

test_data = data_generation(row_count)

# --- Construction benchmarks: build each frame type from the same data.
print(f"Pandas dataframe creation time for {row_count} rows:")
print(_time_call(create_pandas_dataframe, test_data))
print()
print(f"Polars dataframe creation time for {row_count} rows:")
print(_time_call(create_polars_dataframe, test_data))
print()
print(f"Polars lazyframe creation time for {row_count} rows:")
print(_time_call(create_polars_lazyframe, test_data))

print()

# Build each frame once up front so the analysis timings below do not
# include construction cost.
pandas_df = create_pandas_dataframe(test_data)
polars_df = create_polars_dataframe(test_data)
polars_lf = create_polars_lazyframe(test_data)

# --- Analysis benchmarks: grouped sum over the pre-built frames.
print(f"Pandas dataframe analysis time for {row_count} rows:")
print(_time_call(analyze_pandas_dataframe, pandas_df))

print()
print(f"Polars dataframe analysis time for {row_count} rows:")
print(_time_call(analyze_polars_dataframe, polars_df))

print()
print(f"Polars lazyframe analysis time for {row_count} rows:")
print(_time_call(analyze_polars_lazyframe, polars_lf))
0 commit comments