From be0cfd7404252a9a49e93907d783b82fffafca43 Mon Sep 17 00:00:00 2001 From: Alexandre Catarino Date: Thu, 16 Apr 2026 23:52:48 +0100 Subject: [PATCH 1/3] Fix bugs and improve fetch performance in process.py - Reuse a single requests.Session and bound concurrency with a ThreadPoolExecutor(max_workers=8) instead of spawning one thread per page with no limit. - Move the per-date file write out of the per-response loop; previously every date file was rewritten once per page. - Fix country_states scope bug that dropped state names when an agency had multiple countries or none. - Drop the Python 3.6 timezone string workaround; %z now parses the raw value directly. - Drop hardcoded REGALYTICS_API_KEY fallback and the in_federal_register boolean coercion. - Log response page count and per-date article counts; exit 1 if more than one date is produced. Co-Authored-By: Claude Opus 4.7 (1M context) --- process.py | 77 ++++++++++++++++++++---------------------------------- 1 file changed, 29 insertions(+), 48 deletions(-) diff --git a/process.py b/process.py index 3f0709a..16d20d0 100644 --- a/process.py +++ b/process.py @@ -1,9 +1,10 @@ +import sys from json import dumps from pathlib import Path from datetime import datetime, timezone from os import environ -from requests import post -import threading +from concurrent.futures import ThreadPoolExecutor +from requests import Session URL = environ.get("REGALYTICS_API_BASE_URL", "https://api.regalytics.ai/api/v3") API_KEY = environ.get("REGALYTICS_API_KEY", "") @@ -16,6 +17,8 @@ process_date = datetime.strptime(DEPLOYMENT_DATE, '%Y%m%d').strftime('%Y-%m-%d') +SESSION = Session() + def get_data_from_source(process_date): payload = dumps({ @@ -28,31 +31,20 @@ def get_data_from_source(process_date): "page_size": 1000, }, }) + headers = {'Content-Type': 'application/json'} - def get_page(p): - page = post(f"https://api.regalytics.ai/api/v3/search?page={p}", headers={ 'Content-Type': 'application/json' }, data=payload).json() - all_responses.append(page) - - page_1 = post(f"https://api.regalytics.ai/api/v3/search", headers={ 'Content-Type': 'application/json' }, data=payload).json() - all_responses = [page_1] + def fetch(page): + url = f"{URL}/search" if page == 1 else f"{URL}/search?page={page}" + return SESSION.post(url, headers=headers, data=payload).json() + page_1 = fetch(1) if page_1['total_pages'] == 1: - return all_responses - - threads = [] - - for p in range(2, page_1['total_pages'] + 1): - threads.append(threading.Thread(target=get_page, args=(p,))) - - for t in threads: - t.start() + return [page_1] - for t in threads: - t.join() + with ThreadPoolExecutor(max_workers=8) as executor: + remaining = list(executor.map(fetch, range(2, page_1['total_pages'] + 1))) - all_responses.sort(key=lambda x: x['page_number']) - - return all_responses + return [page_1] + remaining # "agencies": [ # { @@ -76,54 +68,43 @@ def get_page(p): def process(process_date): all_responses = get_data_from_source(process_date) + print(f'Fetched {len(all_responses)} response page(s) for {process_date}') for response in all_responses: for article in response.get('results', []): - article['in_federal_register'] = 'yes' in article['in_federal_register'].lower() # State -> Dictionary> states = {} - agencies = article.get('agencies', []) - if not agencies: - agencies = [] + agencies = article.get('agencies') or [] for agency in agencies: state = agency.get('state') if not state: continue + state_names = [x['name'] for x in state] for country in agency.get('country', []): - name = country['name'] - - if not name in states: - country_states = [] - states[name] = country_states - else: - country_states = states[name] - - country_states.extend([x['name'] for x in state]) + states.setdefault(country['name'], []).extend(state_names) article['states'] = states article['agencies'] = [agency['name'] for agency in agencies] - - # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day, + + # search using `created_at` returns all with UTC time between 00:00-23:59 in a single day, # so it include some articles created at 20:00-00:00 in EST of the "previous day" (-04:00). # Adjust timezone info of `created_at` field into UTC time to avoid overwriting the previous day file - article['created_at'] = article['created_at'][:-3] + article['created_at'][-2:] # %z only accepts `-0400` instead of `-04:00` in Python3.6 created_at = datetime.strptime(article['created_at'], '%Y-%m-%dT%H:%M:%S.%f%z').astimezone(timezone.utc) article['created_at'] = created_at.strftime('%Y-%m-%dT%H:%M:%S.%f') date_key = created_at.strftime('%Y%m%d') - if date_key not in articles_by_date: - date_articles = [] - articles_by_date[date_key] = date_articles - else: - date_articles = articles_by_date[date_key] + articles_by_date.setdefault(date_key, []).append(article) - date_articles.append(article) + date_count = len(articles_by_date) + print(f'Writing {date_count} date file(s)') + for date, articles in articles_by_date.items(): + print(f' {date}: {len(articles)} article(s)') + with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file: + article_file.write('\n'.join(dumps(article, indent=None) for article in articles)) - for date, articles in articles_by_date.items(): - with open(ARTICLE_PATH / f'{date}.json', 'w') as article_file: - article_lines = '\n'.join([dumps(article, indent=None) for article in articles]) - article_file.write(article_lines) + if date_count > 1: + sys.exit(f'ERROR: expected 1 date, got {date_count}: {sorted(articles_by_date)}') if __name__ == "__main__": process(process_date) \ No newline at end of file From e617fd43f0ad5cbd9c0e7bbc3d054c8f34610f54 Mon Sep 17 00:00:00 2001 From: Alexandre Catarino Date: Fri, 17 Apr 2026 00:21:48 +0100 Subject: [PATCH 2/3] Bump build.yml to checkout@v5 and net10.0 Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 286187b..4fd6f82 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-24.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v5 - name: Liberate disk space uses: jlumbroso/free-disk-space@main @@ -32,4 +32,4 @@ jobs: # BuildTests dotnet build ./tests/Tests.csproj /p:Configuration=Release /v:quiet /p:WarningLevel=1 && \ # Run Tests - dotnet test ./tests/bin/Release/net9.0/Tests.dll + dotnet test ./tests/bin/Release/net10.0/Tests.dll From 0631f0579562de5c6ec56675a6eea247d00dec18 Mon Sep 17 00:00:00 2001 From: Alexandre Catarino Date: Fri, 17 Apr 2026 15:08:17 +0100 Subject: [PATCH 3/3] Reimplements 'in_federal_register' --- process.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/process.py b/process.py index 16d20d0..971c1ba 100644 --- a/process.py +++ b/process.py @@ -72,6 +72,8 @@ def process(process_date): for response in all_responses: for article in response.get('results', []): + # Convert `in_federal_register` field into boolean value, default to False if the field is missing or empty or None. + article['in_federal_register'] = 'yes' in (article.get('in_federal_register') or '').lower() # State -> Dictionary> states = {} agencies = article.get('agencies') or []