diff --git a/data/analytics/ducky.py b/data/analytics/ducky.py index c01f777..a9c7701 100644 --- a/data/analytics/ducky.py +++ b/data/analytics/ducky.py @@ -1,3 +1,10 @@ -import duckdb +import duckdb -duckdb.sql("SELECT 42 FROM data/2026-01-14/hey.parquet") +df = duckdb.read_parquet("../2026-01-14/hey.parquet") + +duckdb.sql("DESCRIBE SELECT * FROM df").show() + +duckdb.sql("SELECT language, COUNT(language) AS c_p \ + FROM df \ + GROUP BY language \ + ORDER BY c_p DESC").show() \ No newline at end of file diff --git a/worker.py b/worker.py index a99a1be..aae7a5d 100644 --- a/worker.py +++ b/worker.py @@ -123,15 +123,14 @@ def get_github_data(self, start_in_repo_num: int = 0, batch_size: int = 500, git except Exception as validation_error: print(f"Validation error for repo {github_data_points.get('full_name')}: {validation_error}") - print("Skipping this repo and continuing...") + print("Skipping this repo and continuing") continue remaining_api_calls = github_instance.rate_limiting remaining = remaining_api_calls[0] - if remaining_api_calls == 1: + if remaining == 2: print(f"Reached batch size limit of {batch_size}") - break # # start_in_repo_num = counter