From 958d29168605344113660d20775c79b5c37f8e0b Mon Sep 17 00:00:00 2001
From: Oreoluwa Oluwasina
Date: Fri, 16 Jan 2026 12:00:21 +0100
Subject: [PATCH 1/2] Moved data_to_csv to shared.py

---
 scripts/2-process/gcs_process.py       | 23 ++++++-----------------
 scripts/2-process/github_process.py    | 15 ++-------------
 scripts/2-process/wikipedia_process.py | 17 +++--------------
 scripts/shared.py                      | 11 +++++++++++
 4 files changed, 22 insertions(+), 44 deletions(-)

diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index fefbba0f..a79b7d02 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -4,7 +4,6 @@
 """
 # Standard library
 import argparse
-import csv
 import os
 import sys
 import textwrap
@@ -62,16 +61,6 @@ def parse_arguments():
     return args
 
 
-def data_to_csv(args, data, file_path):
-    if not args.enable_save:
-        return
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-    # emulate csv.unix_dialect
-    data.to_csv(
-        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
-    )
-
-
 def process_product_totals(args, count_data):
     """
     Processing count data: totals by product
@@ -111,7 +100,7 @@
         data.items(), columns=["CC legal tool product", "Count"]
     )
     file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path, PATHS)
 
 
 def process_latest_prior_retired_totals(args, count_data):
@@ -192,7 +181,7 @@
         file_path = shared.path_join(
            PATHS["data_phase"], f"gcs_status_{key}_totals.csv"
         )
-        data_to_csv(args, dataframe, file_path)
+        shared.data_to_csv(args, dataframe, file_path)
 
 
 def process_totals_by_free_cultural(args, count_data):
@@ -225,7 +214,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_free_cultural.csv"
     )
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path)
 
 
 def process_totals_by_restrictions(args, count_data):
@@ -259,7 +248,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_restrictions.csv"
     )
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path)
 
 
 def process_totals_by_language(args, data):
@@ -280,7 +269,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_language.csv"
     )
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path)
 
 
 def process_totals_by_country(args, data):
@@ -301,7 +290,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "gcs_totals_by_country.csv"
     )
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path)
 
 
 def main():
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index 27945613..ec4441cc 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -5,7 +5,6 @@
 """
 # Standard library
 import argparse
-import csv
 import os
 import sys
 import traceback
@@ -66,16 +65,6 @@ def check_for_data_file(file_path):
         )
 
 
-def data_to_csv(args, data, file_path):
-    if not args.enable_save:
-        return
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-    # emulate csv.unix_dialect
-    data.to_csv(
-        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
-    )
-
-
 def process_totals_by_license(args, count_data):
     """
     Processing count data: totals by License
@@ -99,7 +88,7 @@
         PATHS["data_phase"], "github_totals_by_license.csv"
     )
     check_for_data_file(file_path)
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path, PATHS)
 
 
 def process_totals_by_restriction(args, count_data):
@@ -134,7 +123,7 @@
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
     check_for_data_file(file_path)
-    data_to_csv(args, data, file_path)
+    shared.data_to_csv(args, data, file_path, PATHS)
 
 
 def main():
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 7712b26a..41435ec0 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -5,7 +5,6 @@
 """
 # Standard library
 import argparse
-import csv
 import os
 import sys
 import textwrap
@@ -70,16 +69,6 @@ def check_for_data_file(file_path):
         )
 
 
-def data_to_csv(args, data, file_path):
-    if not args.enable_save:
-        return
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
-    # emulate csv.unix_dialect
-    data.to_csv(
-        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
-    )
-
-
 def process_highest_language_usage(args, count_data):
     """
     Processing count data: Most represented languages
@@ -99,7 +88,7 @@
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
     check_for_data_file(file_path)
-    data_to_csv(args, top_10, file_path)
+    shared.data_to_csv(args, top_10, file_path, PATHS)
 
 
 def process_least_language_usage(args, count_data):
@@ -123,7 +112,7 @@
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
    )
     check_for_data_file(file_path)
-    data_to_csv(args, bottom_10, file_path)
+    shared.data_to_csv(args, bottom_10, file_path, PATHS)
 
 
 def process_language_representation(args, count_data):
@@ -150,7 +139,7 @@
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
     check_for_data_file(file_path)
-    data_to_csv(args, language_counts, file_path)
+    shared.data_to_csv(args, language_counts, file_path, PATHS)
 
 
 def main():
diff --git a/scripts/shared.py b/scripts/shared.py
index 51cfd8d2..66650c43 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -1,4 +1,5 @@
 # Standard library
+import csv
 import logging
 import os
 import sys
@@ -36,6 +37,16 @@ def __init__(self, message, exit_code=None):
         super().__init__(self.message)
 
 
+def data_to_csv(args, data, file_path, PATHS):
+    if not args.enable_save:
+        return
+    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    # emulate csv.unix_dialect
+    data.to_csv(
+        file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
+    )
+
+
 def get_session(accept_header=None, session=None):
     """
     Create or configure a reusable HTTPS session with retry logic and

From 39e692e20f58ead79e7547a05193d02c770ba2fd Mon Sep 17 00:00:00 2001
From: Oreoluwa Oluwasina
Date: Wed, 21 Jan 2026 08:38:01 +0100
Subject: [PATCH 2/2] Made review changes

---
 scripts/2-process/gcs_process.py       | 2 +-
 scripts/2-process/github_process.py    | 5 +++--
 scripts/2-process/wikipedia_process.py | 7 ++++---
 scripts/shared.py                      | 6 ++++--
 4 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/scripts/2-process/gcs_process.py b/scripts/2-process/gcs_process.py
index 252fc8cc..f3fb192f 100755
--- a/scripts/2-process/gcs_process.py
+++ b/scripts/2-process/gcs_process.py
@@ -120,7 +120,7 @@ def process_product_totals(args, count_data):
         data.items(), columns=["CC legal tool product", "Count"]
     )
     file_path = shared.path_join(PATHS["data_phase"], "gcs_product_totals.csv")
-    shared.data_to_csv(args, data, file_path, PATHS)
+    shared.data_to_csv(args, data, file_path)
 
 
 def process_latest_prior_retired_totals(args, count_data):
diff --git a/scripts/2-process/github_process.py b/scripts/2-process/github_process.py
index bd4f2eef..5553f842 100755
--- a/scripts/2-process/github_process.py
+++ b/scripts/2-process/github_process.py
@@ -94,7 +94,8 @@ def process_totals_by_license(args, count_data):
     data.reset_index(drop=True, inplace=True)
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_license.csv"
-    shared.data_to_csv(args, data, file_paths, PATH)
+    )
+    shared.data_to_csv(args, data, file_path)
 
 
 def process_totals_by_restriction(args, count_data):
@@ -128,7 +129,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "github_totals_by_restriction.csv"
     )
-    shared.data_to_csv(args, data, file_path, PATHS)
+    shared.data_to_csv(args, data, file_path)
 
 
 def main():
diff --git a/scripts/2-process/wikipedia_process.py b/scripts/2-process/wikipedia_process.py
index 90372045..2d23b393 100755
--- a/scripts/2-process/wikipedia_process.py
+++ b/scripts/2-process/wikipedia_process.py
@@ -102,7 +102,7 @@ def process_highest_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_highest_language_usage.csv"
     )
-    shared.data_to_csv(args, top_10, file_path, PATHS)
+    shared.data_to_csv(args, top_10, file_path)
 
 
 def process_least_language_usage(args, count_data):
@@ -125,7 +125,8 @@ def process_least_language_usage(args, count_data):
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_least_language_usage.csv"
     )
-    shared.data_to_csv(args, bottom_10, file_path, PATHS)
+    shared.data_to_csv(args, bottom_10, file_path)
+
 def process_language_representation(args, count_data):
     """
     Processing count data: Language representation
@@ -150,7 +151,7 @@
     file_path = shared.path_join(
         PATHS["data_phase"], "wikipedia_language_representation.csv"
     )
-    shared.data_to_csv(args, language_counts, file_path, PATHS)
+    shared.data_to_csv(args, language_counts, file_path)
 
 
 def main():
diff --git a/scripts/shared.py b/scripts/shared.py
index 302da467..619d0770 100644
--- a/scripts/shared.py
+++ b/scripts/shared.py
@@ -37,14 +37,16 @@ def __init__(self, message, exit_code=None):
         super().__init__(self.message)
 
 
-def data_to_csv(args, data, file_path, PATHS):
+def data_to_csv(args, data, file_path):
     if not args.enable_save:
         return
-    os.makedirs(PATHS["data_phase"], exist_ok=True)
+    os.makedirs(args.paths["data_phase"], exist_ok=True)
     # emulate csv.unix_dialect
     data.to_csv(
         file_path, index=False, quoting=csv.QUOTE_ALL, lineterminator="\n"
     )
+
+
 def check_for_data_files(args, file_paths, QUARTER):
     if args.force:
         return