diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 4e74707582..960aebabc0 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -82,4 +82,7 @@ jobs: cl/search/docket_number_cleaner.py \ cl/search/management/commands/clean_docket_number_raw.py \ cl/scrapers/management/commands/back_scrape_dockets.py \ + cl/corpus_importer/management/commands/import_texas_dockets.py \ + cl/corpus_importer/management/utils.py \ + cl/lib/decorators.py \ cl/corpus_importer/management/commands/import_scotus_dockets.py diff --git a/cl/alerts/management/commands/handle_old_docket_alerts.py b/cl/alerts/management/commands/handle_old_docket_alerts.py index 50fd22fdc9..e68fad1df7 100644 --- a/cl/alerts/management/commands/handle_old_docket_alerts.py +++ b/cl/alerts/management/commands/handle_old_docket_alerts.py @@ -1,5 +1,6 @@ from argparse import RawTextHelpFormatter +from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings from django.contrib.auth.models import User from django.core.mail import EmailMultiAlternatives @@ -78,7 +79,7 @@ def build_user_report(user, delete=False): return report -def send_old_alert_warning_email_and_webhook(user, report) -> int: +async def send_old_alert_warning_email_and_webhook(user, report) -> int: """Send alerts emails and webhooks for old alerts :param user: The user with terminated dockets @@ -91,17 +92,21 @@ def send_old_alert_warning_email_and_webhook(user, report) -> int: ) webhook_count = 0 if report.very_old_alerts or report.disabled_alerts: - for user_webhook in user_webhooks: - send_old_alerts_webhook_event(user_webhook, report) + async for user_webhook in user_webhooks: + await send_old_alerts_webhook_event(user_webhook, report) webhook_count += 1 count = report.total_count() subject_template = loader.get_template("emails/old_email_subject.txt") subject = subject_template.render({"count": count}).strip() - txt = loader.get_template("emails/old_alert_email.txt").render( + 
txt = await sync_to_async( + loader.get_template("emails/old_alert_email.txt").render + )( {"report_data": report}, ) - html = loader.get_template("emails/old_alert_email.html").render( + html = await sync_to_async( + loader.get_template("emails/old_alert_email.html").render + )( {"report_data": report}, ) msg = EmailMultiAlternatives( @@ -174,9 +179,9 @@ def handle(self, *args, **options): count = report.total_count() if options["send_alerts"] and count > 0: emails_sent += 1 - webhooks_count = send_old_alert_warning_email_and_webhook( - user, report - ) + webhooks_count = async_to_sync( + send_old_alert_warning_email_and_webhook + )(user, report) webhooks_sent += webhooks_count logger.info( diff --git a/cl/alerts/management/commands/monitor_pacer.py b/cl/alerts/management/commands/monitor_pacer.py index 514b063c71..8ff85fe1fe 100644 --- a/cl/alerts/management/commands/monitor_pacer.py +++ b/cl/alerts/management/commands/monitor_pacer.py @@ -1,6 +1,7 @@ import datetime import time +from asgiref.sync import async_to_sync from django.conf import settings from django.core.mail import send_mail from django.template import loader @@ -48,12 +49,12 @@ def handle(self, *args, **options): s = ProxyPacerSession( username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD ) - s.login() + async_to_sync(s.login)() report = CaseQueryAdvancedBankruptcy("canb", s) t1 = now() while True: query = "Pacific" - report.query( + async_to_sync(report.query)( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), @@ -66,7 +67,7 @@ def handle(self, *args, **options): exit(0) query = "PG&E" - report.query( + async_to_sync(report.query)( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), @@ -83,5 +84,5 @@ def handle(self, *args, **options): min_login_frequency = 60 * 30 # thirty minutes if (t2 - t1).seconds > min_login_frequency: print("Logging in again.") - s.login() + async_to_sync(s.login)() t1 = now() 
diff --git a/cl/api/utils.py b/cl/api/utils.py index fa3d908dd0..75f3e1ad58 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -7,6 +7,7 @@ from typing import Any, TypedDict import eyecite +from asgiref.sync import async_to_sync, sync_to_async from dateutil import parser from dateutil.rrule import DAILY, rrule from django.conf import settings @@ -25,7 +26,7 @@ from django.views.decorators.vary import vary_on_headers from django_ratelimit.core import get_header from eyecite.tokenizers import HyperscanTokenizer -from requests import Response +from httpx import Response from rest_framework import serializers from rest_framework.exceptions import Throttled, ValidationError from rest_framework.metadata import SimpleMetadata @@ -1185,7 +1186,7 @@ def get_next_webhook_retry_date(retry_counter: int) -> datetime: WEBHOOK_MAX_RETRY_COUNTER = 7 -def check_webhook_failure_count_and_notify( +async def check_webhook_failure_count_and_notify( webhook_event: WebhookEvent, ) -> None: """Check if a Webhook needs to be disabled and/or send a notification about @@ -1208,7 +1209,7 @@ def check_webhook_failure_count_and_notify( 6: False, 7: True, # Send webhook disabled notification } - webhook = webhook_event.webhook + webhook = await Webhook.objects.aget(pk=webhook_event.webhook_id) if not webhook.enabled or webhook_event.debug: return @@ -1218,33 +1219,33 @@ def check_webhook_failure_count_and_notify( current_try_counter = webhook_event.retry_counter notify = notify_on[current_try_counter] if notify: - oldest_enqueued_for_retry = WebhookEvent.objects.filter( - webhook=webhook_event.webhook, + oldest_enqueued_for_retry = await WebhookEvent.objects.filter( + webhook=webhook, event_status=WEBHOOK_EVENT_STATUS.ENQUEUED_RETRY, debug=False, - ).earliest("date_created") + ).aearliest("date_created") if current_try_counter >= WEBHOOK_MAX_RETRY_COUNTER: webhook.enabled = False update_fields.append("enabled") update_fields.append("date_modified") # If the parent webhook is disabled mark all 
current ENQUEUED_RETRY # events as ENDPOINT_DISABLED - WebhookEvent.objects.filter( - webhook=webhook_event.webhook, + await WebhookEvent.objects.filter( + webhook=webhook, event_status=WEBHOOK_EVENT_STATUS.ENQUEUED_RETRY, debug=False, - ).update( + ).aupdate( event_status=WEBHOOK_EVENT_STATUS.ENDPOINT_DISABLED, date_modified=now(), ) if oldest_enqueued_for_retry.pk == webhook_event.pk: failure_counter = current_try_counter + 1 - notify_failing_webhook.delay( + await sync_to_async(notify_failing_webhook.delay)( webhook_event.pk, failure_counter, webhook.enabled ) # Save webhook and avoid emailing admins via signal in cl.users.signals - webhook.save(update_fields=update_fields) + await webhook.asave(update_fields=update_fields) def update_webhook_event_after_request( @@ -1288,7 +1289,7 @@ def update_webhook_event_after_request( if error is None: error = "" webhook_event.error_message = error - check_webhook_failure_count_and_notify(webhook_event) + async_to_sync(check_webhook_failure_count_and_notify)(webhook_event) if webhook_event.retry_counter >= WEBHOOK_MAX_RETRY_COUNTER: # If the webhook has reached the max retry counter, mark as failed webhook_event.event_status = WEBHOOK_EVENT_STATUS.FAILED diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index f6ff5f50f7..5c2c17be34 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -2,7 +2,9 @@ import random import requests +from asgiref.sync import sync_to_async from django.conf import settings +from django.contrib.auth.models import User from elasticsearch_dsl.response import Response from rest_framework.renderers import JSONRenderer @@ -80,7 +82,7 @@ def send_webhook_event( update_webhook_event_after_request(webhook_event, error=error_str) -def send_old_alerts_webhook_event( +async def send_old_alerts_webhook_event( webhook: Webhook, report: OldAlertReport ) -> None: """Send webhook event for old alerts @@ -115,14 +117,14 @@ def send_old_alerts_webhook_event( post_content, 
accepted_media_type="application/json;", ) - webhook_event = WebhookEvent.objects.create( + webhook_event = await WebhookEvent.objects.acreate( webhook=webhook, content=post_content, ) - send_webhook_event(webhook_event, json_bytes) + await sync_to_async(send_webhook_event)(webhook_event, json_bytes) -def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: +async def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: """Send webhook event for processed PacerFetchQueue objects. :param fq: The PacerFetchQueue object related to the event. @@ -137,10 +139,11 @@ def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: PROCESSING_STATUS.INVALID_CONTENT, PROCESSING_STATUS.NEEDS_INFO, ]: - user_webhooks = fq.user.webhooks.filter( + user = await User.objects.aget(pk=fq.user_id) + user_webhooks = user.webhooks.filter( event_type=WebhookEventType.RECAP_FETCH, enabled=True ) - for webhook in user_webhooks: + async for webhook in user_webhooks: payload = PacerFetchQueueSerializer(fq).data post_content = { "webhook": generate_webhook_key_content(webhook), @@ -151,11 +154,11 @@ def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: post_content, accepted_media_type="application/json;", ) - webhook_event = WebhookEvent.objects.create( + webhook_event = await WebhookEvent.objects.acreate( webhook=webhook, content=post_content, ) - send_webhook_event(webhook_event, json_bytes) + await sync_to_async(send_webhook_event)(webhook_event, json_bytes) def send_search_alert_webhook( diff --git a/cl/corpus_importer/bulk_utils.py b/cl/corpus_importer/bulk_utils.py index 5c5919d1d6..1ab1574df6 100644 --- a/cl/corpus_importer/bulk_utils.py +++ b/cl/corpus_importer/bulk_utils.py @@ -1,3 +1,4 @@ +from asgiref.sync import async_to_sync from celery import chain from cl.corpus_importer.tasks import get_pacer_doc_by_rd @@ -45,7 +46,7 @@ def get_petitions( session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - session.login() + async_to_sync(session.login)() 
for i, rd_pk in enumerate(rds): if i < options["offset"]: i += 1 @@ -57,7 +58,7 @@ def get_petitions( session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - session.login() + async_to_sync(session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() diff --git a/cl/corpus_importer/management/commands/760_project.py b/cl/corpus_importer/management/commands/760_project.py index 90ecf391b7..bd1fdc4352 100644 --- a/cl/corpus_importer/management/commands/760_project.py +++ b/cl/corpus_importer/management/commands/760_project.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -34,7 +35,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, session.proxy_address) for i, row in enumerate(reader): if i < options["offset"]: @@ -101,7 +102,7 @@ def get_att_pages(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() get_district_attachment_pages( options=options, rd_pks=rd_pks, tag_names=[TAG], session=session ) diff --git a/cl/corpus_importer/management/commands/adelman_david.py b/cl/corpus_importer/management/commands/adelman_david.py index b37279b9a6..ae4c0c6530 100644 --- a/cl/corpus_importer/management/commands/adelman_david.py +++ b/cl/corpus_importer/management/commands/adelman_david.py @@ -1,6 +1,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -31,7 +32,7 @@ def download_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, 
session.proxy_address) for i, row in enumerate(reader): if i < options["offset"]: diff --git a/cl/corpus_importer/management/commands/buchwald_project.py b/cl/corpus_importer/management/commands/buchwald_project.py index 3bb5f2f292..7ff1690d53 100644 --- a/cl/corpus_importer/management/commands/buchwald_project.py +++ b/cl/corpus_importer/management/commands/buchwald_project.py @@ -1,6 +1,7 @@ import os from argparse import RawTextHelpFormatter +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -36,7 +37,7 @@ def add_all_nysd_to_cl(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # IDs obtained by binary search of docket numbers on PACER website. earliest_id = 405990 @@ -53,7 +54,7 @@ def add_all_nysd_to_cl(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("Doing pacer_case_id: %s", pacer_case_id) @@ -72,7 +73,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() buchwald_id = 450 ds = ( @@ -95,7 +96,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("%s: Doing docket with pk: %s", i, d.pk) diff --git a/cl/corpus_importer/management/commands/claims_activity_project.py b/cl/corpus_importer/management/commands/claims_activity_project.py index 6ba8a66565..a9e57979f2 100644 --- a/cl/corpus_importer/management/commands/claims_activity_project.py +++ b/cl/corpus_importer/management/commands/claims_activity_project.py @@ -6,6 +6,7 @@ from datetime import date import pandas as pd +from asgiref.sync import async_to_sync from django.conf import settings from 
juriscraper.pacer import ClaimsActivity @@ -45,7 +46,7 @@ def query_and_parse_claims_activity( } s = ProxyPacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) - s.login() + async_to_sync(s.login)() for court_id in courts: court = map_cl_to_pacer_id(court_id) for alias, creditor_name in creditor_names.items(): diff --git a/cl/corpus_importer/management/commands/download_texas_documents.py b/cl/corpus_importer/management/commands/download_texas_documents.py new file mode 100644 index 0000000000..9ce85102df --- /dev/null +++ b/cl/corpus_importer/management/commands/download_texas_documents.py @@ -0,0 +1,172 @@ +import time +from itertools import batched + +from celery import chain +from django.db.models import Q + +from cl.corpus_importer.tasks import download_texas_document_pdf, logger +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.command_utils import VerboseCommand +from cl.scrapers.tasks import extract_pdf_document +from cl.search.models import TexasDocument + + +def extract_texas_documents( + extraction_queue: str, batch_size: int, delay: float +) -> None: + """ + Run the extraction task for TexasDocument instances where ocr_status is not + OCR_UNNECESSARY or OCR_COMPLETE. + + :param extraction_queue: The celery queue for PDF extraction tasks. + :param batch_size: The batch size for PDF extraction tasks. + :param delay: Seconds to sleep between scheduling tasks. 
+ + :return: None + """ + docs = ( + TexasDocument.objects.exclude( + Q(filepath_local="") + | Q( + ocr_status__in=( + TexasDocument.OCR_UNNECESSARY, + TexasDocument.OCR_COMPLETE, + ) + ) + ) + .values_list("pk", flat=True) + .order_by() + ) + count = docs.count() + logger.info("Found %s TexasDocuments needing extraction.", count) + throttle = CeleryThrottle(queue_name=extraction_queue) + processed_count = 0 + for pks in batched(docs.iterator(), batch_size): + throttle.maybe_wait() + extract_pdf_document.si( + pks=pks, + check_if_needed=False, + model_name="search.TexasDocument", + ).set(queue=extraction_queue).apply_async() + processed_count += 1 + if processed_count % 100 == 0: + logger.info( + "Scheduled %s/%s (%s)", + min(processed_count * batch_size, count), + count, + f"{min(processed_count * batch_size, count) / count:.0%}", + ) + time.sleep(delay) + logger.info( + "Scheduled %s/%s", + min(processed_count * batch_size, count), + count, + ) + + +def download_and_extract_texas_documents( + download_queue: str, extraction_queue: str, delay: float +) -> None: + """ + Download and extract attachments for TexasDocument with a missing or stale + local file. + + Queries TexasDocument instances that have no filepath_local, then schedules + a download -> extraction chain for each. + + :param download_queue: The celery queue for download tasks. + :param extraction_queue: The celery queue for extraction tasks. + :param delay: Seconds to sleep between scheduling tasks. 
+ + :return: None + """ + docs = ( + TexasDocument.objects.filter(filepath_local="") + .values_list("pk", flat=True) + .order_by() + ) + count = docs.count() + logger.info( + "Found %s TexasDocuments needing download and extraction.", count + ) + throttle = CeleryThrottle(queue_name=extraction_queue) + processed_count = 0 + for pk in docs.iterator(): + throttle.maybe_wait() + chain( + download_texas_document_pdf.si(pk).set(queue=download_queue), + extract_pdf_document.s( + check_if_needed=False, + model_name="search.TexasDocument", + ).set(queue=extraction_queue), + ).apply_async() + processed_count += 1 + if processed_count % 100 == 0: + logger.info( + "Scheduled %s/%s (%s)", + processed_count, + count, + f"{processed_count / count:.0%}", + ) + time.sleep(delay) + logger.info( + "Scheduled %s/%s", + processed_count, + count, + ) + + +class Command(VerboseCommand): + help = "Download and extract PDFs for TexasDocument instances which have missing or stale local files." + + def add_arguments(self, parser): + parser.add_argument( + "--download-queue", + type=str, + default="celery", + help="The celery queue for PDF download tasks.", + ) + parser.add_argument( + "--extraction-queue", + type=str, + default="celery", + help="The celery queue for PDF extraction tasks.", + ) + parser.add_argument( + "--only-extraction", + type=lambda v: v.strip().lower() in ("1", "true", "t", "yes", "y", "on"), + default=False, + help="Skip downloading attachments and only run the extraction task.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="The batch size for PDF extraction tasks. 
Only used if --only-extraction is true.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Seconds to sleep between scheduling tasks.", + ) + + def handle(self, *args, **options): + super().handle(*args, **options) + + extraction_queue = options["extraction_queue"] + delay = options["delay"] + only_extraction = options["only_extraction"] + + if only_extraction: + batch_size = options["batch_size"] + logger.info("Running extraction for TexasDocuments...") + extract_texas_documents(extraction_queue, batch_size, delay) + else: + download_queue = options["download_queue"] + logger.info( + "Downloading and extracting TexasDocument attachments..." + ) + download_and_extract_texas_documents( + download_queue, extraction_queue, delay + ) diff --git a/cl/corpus_importer/management/commands/everything_project.py b/cl/corpus_importer/management/commands/everything_project.py index bbe84f8daf..315cdef323 100644 --- a/cl/corpus_importer/management/commands/everything_project.py +++ b/cl/corpus_importer/management/commands/everything_project.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -115,7 +116,7 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(items): if i < options["offset"]: continue @@ -128,7 +129,7 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # All tests pass. Get the docket. 
logger.info("Doing row %s: %s", i, row) diff --git a/cl/corpus_importer/management/commands/export_control.py b/cl/corpus_importer/management/commands/export_control.py index 882aa74741..4c606bcd41 100644 --- a/cl/corpus_importer/management/commands/export_control.py +++ b/cl/corpus_importer/management/commands/export_control.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.task_canvases import get_docket_and_claims @@ -70,7 +71,7 @@ def get_data(options, row_transform, tags): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue diff --git a/cl/corpus_importer/management/commands/fill_case_transfers.py b/cl/corpus_importer/management/commands/fill_case_transfers.py new file mode 100644 index 0000000000..52f8744a1e --- /dev/null +++ b/cl/corpus_importer/management/commands/fill_case_transfers.py @@ -0,0 +1,10 @@ +from cl.lib.command_utils import VerboseCommand +from cl.search.models import CaseTransfer + + +class Command(VerboseCommand): + help = "Update missing docket foreign keys in the CaseTransfer table." 
+ + def handle(self, *args, **options): + super().handle(*args, **options) + CaseTransfer.fill_null_dockets() diff --git a/cl/corpus_importer/management/commands/get_pacer_doc_ids.py b/cl/corpus_importer/management/commands/get_pacer_doc_ids.py index 764465d978..79b674112b 100644 --- a/cl/corpus_importer/management/commands/get_pacer_doc_ids.py +++ b/cl/corpus_importer/management/commands/get_pacer_doc_ids.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.tasks import get_pacer_doc_id_with_show_case_doc_url @@ -37,7 +38,7 @@ def get_pacer_doc_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() logger.info( f"Sent {completed} tasks to celery so far. 
Latest pk: {row_pk}" ) diff --git a/cl/corpus_importer/management/commands/import_patent.py b/cl/corpus_importer/management/commands/import_patent.py index 045654a0d6..15485205c0 100644 --- a/cl/corpus_importer/management/commands/import_patent.py +++ b/cl/corpus_importer/management/commands/import_patent.py @@ -1,6 +1,7 @@ import os import time +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -42,7 +43,7 @@ def get_dockets(options: dict) -> None: session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, session.proxy_address) NOS_CODES = [PATENT, PATENT_ANDA] DISTRICTS = ["ded", "txwd"] diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py new file mode 100644 index 0000000000..4189444da6 --- /dev/null +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -0,0 +1,51 @@ +from collections.abc import Iterable +from pathlib import Path + +from cl.celery_init import app +from cl.corpus_importer.management.utils import ( + CorpusImporterCommand, +) +from cl.corpus_importer.tasks import ( + texas_corpus_download_task, + texas_ingest_docket_task, +) +from cl.search.models import CaseTransfer + + +class Command(CorpusImporterCommand): + help = "Import Texas dockets from S3 using an inventory CSV." 
+ + compose_redis_key = "texas_docket_import:log" + + @staticmethod + def transform_inventory_iterator( + csv_reader: Iterable[list[str]], + ) -> Iterable[tuple[tuple[str, str], tuple[str, str]]]: + meta_rows = filter( + # Filter only for meta files which are not duplicates (don't end in "_X") and not for search result scrapes + lambda r: "searches" not in r[1] + and Path(r[1]).name.endswith("_meta.json"), + map(lambda r: (r[0].strip(), r[1].strip()), csv_reader), + ) + + for meta_row in meta_rows: + meta_bucket, meta_key = meta_row + meta_path = Path(meta_key) + docket_name = meta_path.stem.removesuffix("_meta") + html_key = str(meta_path.with_name(f"{docket_name}.html")) + yield ( + (meta_bucket, html_key), + (meta_bucket, meta_key), + ) + + @staticmethod + def download_task() -> app.Task: + return texas_corpus_download_task + + @staticmethod + def merge_task() -> app.Task: + return texas_ingest_docket_task + + def handle(self, *args, **options): + super().handle(*args, **options) + CaseTransfer.fill_null_dockets() diff --git a/cl/corpus_importer/management/commands/jackson_project.py b/cl/corpus_importer/management/commands/jackson_project.py index 97ca50d29b..2caede24e4 100644 --- a/cl/corpus_importer/management/commands/jackson_project.py +++ b/cl/corpus_importer/management/commands/jackson_project.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -22,7 +23,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() jackson_id = 1609 ds = Docket.objects.filter(court_id="dcd", assigned_to_id=jackson_id) diff --git a/cl/corpus_importer/management/commands/kessler_ilnb.py b/cl/corpus_importer/management/commands/kessler_ilnb.py index e93c5e0824..8778b59b78 100644 --- a/cl/corpus_importer/management/commands/kessler_ilnb.py +++ b/cl/corpus_importer/management/commands/kessler_ilnb.py 
@@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -37,7 +38,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue @@ -48,7 +49,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() @@ -93,7 +94,7 @@ def get_final_docs(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, de in enumerate(des): if i < options["offset"]: i += 1 @@ -104,7 +105,7 @@ def get_final_docs(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) rd_pks = ( diff --git a/cl/corpus_importer/management/commands/list_of_creditors_project.py b/cl/corpus_importer/management/commands/list_of_creditors_project.py index 93e79dda20..e0d82e188a 100644 --- a/cl/corpus_importer/management/commands/list_of_creditors_project.py +++ b/cl/corpus_importer/management/commands/list_of_creditors_project.py @@ -5,6 +5,7 @@ import re from typing import TypedDict, cast +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.bulk_utils import make_bankr_docket_number @@ -72,7 +73,7 @@ def query_and_save_creditors_data(options: OptionsType) -> None: session = ProxyPacerSession( username=CLIENT_PACER_USERNAME, password=CLIENT_PACER_PASSWORD ) - session.login() + 
async_to_sync(session.login)() throttle = CeleryThrottle(queue_name=q) completed = 0 for i, rows in enumerate( diff --git a/cl/corpus_importer/management/commands/nos_700.py b/cl/corpus_importer/management/commands/nos_700.py index 6d383ffe95..57d5d8d077 100644 --- a/cl/corpus_importer/management/commands/nos_700.py +++ b/cl/corpus_importer/management/commands/nos_700.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -231,7 +232,7 @@ def get_dockets(options, items, tags, sample_size=0): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(items): if i < options["offset"]: continue @@ -244,7 +245,7 @@ def get_dockets(options, items, tags, sample_size=0): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # All tests pass. Get the docket. 
logger.info("Doing row %s: %s", i, row) @@ -281,7 +282,7 @@ def get_attachment_pages(options, tag): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() get_district_attachment_pages( options=options, rd_pks=rd_pks, tag_names=[tag], session=session ) diff --git a/cl/corpus_importer/management/commands/nywb_chapter_7.py b/cl/corpus_importer/management/commands/nywb_chapter_7.py index 9fa791a63c..dfafc1d13d 100644 --- a/cl/corpus_importer/management/commands/nywb_chapter_7.py +++ b/cl/corpus_importer/management/commands/nywb_chapter_7.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from cl.corpus_importer.bulk_utils import ( @@ -32,7 +33,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue @@ -43,7 +44,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py new file mode 100644 index 0000000000..932b08dde0 --- /dev/null +++ b/cl/corpus_importer/management/utils.py @@ -0,0 +1,218 @@ +import csv +import random +import time +from abc import ABC, abstractmethod +from collections.abc import Iterable +from datetime import date +from itertools import islice +from typing import final + +from celery import chain +from django.conf import settings +from pydantic import BaseModel, field_validator + +from cl.celery_init import app +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.command_utils import VerboseCommand, logger 
+from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, + log_last_document_indexed, +) + + +class TexasDocketMeta(BaseModel): + case_number: str + case_url: str + date_filed: date + style: str + v: str + case_type: str + coa_case_number: str + trial_court_case_number: str + trial_court_county: str + trial_court: str + appellate_court: str + court_code: str + + @field_validator("date_filed", mode="before") + def date_filed_validator(cls, v): + return date(*(time.strptime(v, "%m/%d/%Y")[0:3])) + + +class CorpusImporterCommand(VerboseCommand, ABC): + """Base class for `cl.corpus_importer` commands encapsulating inventory\ + file reading, celery queue interactions, and redis logging. + + Uses an inventory CSV from S3 to find files to parse and ingest into the\ + database. Includes ratelimiting and autoresume logic. + + Required methods are: + + - `merge_task`: Should return a Celery task which takes the output of\ + `download_task`, parses it, and merges it into the database. Input\ + should be whatever the output of `download_task` is. Must accept a\ + `download_attachments` boolean keyword argument indicating whether\ + docket entry attachments should be downloaded as part of the merging\ + process. + + Required properties are: + + - `compose_redis_key`: The Redis log key to use for tracking progress. 
+ + Optional methods are: + - `download_task`: Should return the task used to download files from S3.\ + A default implementation is provided for convenience.""" + + compose_redis_key: str + + @final + def add_arguments(self, parser): + parser.add_argument( + "--inventory-file", + required=True, + help="Path to the inventory CSV relative to MEDIA_ROOT.", + ) + parser.add_argument( + "--retrieval-queue", + default="celery", + help="Which celery queue to use for S3 retrieval.", + ) + parser.add_argument( + "--ingesting-queue", + default="celery", + help="Which celery queue to use for DB ingesting.", + ) + parser.add_argument( + "--throttle-min-items", + type=int, + default=5, + help="CeleryThrottle min_items parameter.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Seconds to sleep between scheduling tasks.", + ) + parser.add_argument( + "--start-row", + type=int, + default=0, + help="Row number to start from (for manual resume).", + ) + parser.add_argument( + "--inventory-rows", + type=int, + required=True, + help="Total number of rows in the inventory CSV. Used to " + "log progress percentage.", + ) + parser.add_argument( + "--auto-resume", + action="store_true", + default=False, + help="Resume from last row stored in Redis.", + ) + parser.add_argument( + "--test-random", + type=bool, + default=False, + help="Randomly select rows from the inventory file to import.", + ) + parser.add_argument( + "--download-attachments", + type=bool, + default=False, + help="Whether to download docket entry attachments as part of this command.", + ) + + @staticmethod + def download_task() -> app.Task: + from cl.corpus_importer.tasks import default_corpus_download_task + + return default_corpus_download_task + + @staticmethod + @abstractmethod + def merge_task() -> app.Task: ... 
+ + @staticmethod + def transform_inventory_iterator( + csv_reader: Iterable[list[str]], + ) -> Iterable: + """ + Optionally performs transformations on the inventory CSV file\ + before passing it to the download Celery task. Can be used for\ + instance to merge consecutive rows which represent the same docket\ + into one object. + + :param csv_reader: The `csv.Reader` object to use to read the CSV. + + :return: The transformed inventory CSV iterator. The item of the\ + iterable should be a list of arguments to be passed to the\ + download task.""" + return map(lambda row: [row[0].strip(), row[1].strip()], csv_reader) + + def handle(self, *args, **options): + super().handle(*args, **options) + + retrieval_queue = options["retrieval_queue"] + ingesting_queue = options["ingesting_queue"] + delay = options["delay"] + inventory_rows = options["inventory_rows"] + inventory_path = settings.MEDIA_ROOT / options["inventory_file"] + download_attachments = options["download_attachments"] + + start_row = options["start_row"] + if options["auto_resume"]: + start_row = get_last_parent_document_id_processed( + self.compose_redis_key + ) + logger.info("Auto-resuming from row %s.", start_row) + + total_rows = inventory_rows - start_row + + throttle = CeleryThrottle( + min_items=options["throttle_min_items"], + queue_name=ingesting_queue, + ) + + with open(inventory_path, encoding="utf-8") as f: + download_inputs = self.transform_inventory_iterator(csv.reader(f)) + if options["test_random"]: + logger.warning( + "In testing mode. Randomly selecting rows from the inventory file." 
+ ) + download_inputs = filter( + lambda _: random.random() < 0.001, download_inputs + ) + for row_idx, download_args in islice( + enumerate(download_inputs), start_row, None + ): + throttle.maybe_wait() + chain( + self.download_task() + .si(*download_args) + .set(queue=retrieval_queue), + self.merge_task() + .s(download_attachments=download_attachments) + .set(queue=ingesting_queue), + ).apply_async() + time.sleep(delay) + + if row_idx % 100 == 0: + processed = row_idx - start_row + progress = ( + f" ({processed / total_rows:.1%})" + if total_rows + else "" + ) + logger.info( + "Scheduled %s rows %s. Current row: %s.", + row_idx, + progress, + download_args, + ) + log_last_document_indexed(row_idx, self.compose_redis_key) + + logger.info("Finished scheduling all rows from inventory.") diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index e9f4ecbc06..15452e01a6 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import hashlib import logging @@ -12,15 +14,16 @@ from pyexpat import ExpatError from re import Pattern from tempfile import NamedTemporaryFile -from typing import IO, Any +from typing import IO, Any, NamedTuple from urllib.parse import urljoin import botocore.exceptions import environ import eyecite +import httpx import internetarchive as ia import requests -from asgiref.sync import async_to_sync +from asgiref.sync import async_to_sync, sync_to_async from celery import Task, chain from celery.exceptions import SoftTimeLimitExceeded from django.conf import settings @@ -32,10 +35,15 @@ from django.utils.timezone import localtime, now from eyecite.tokenizers import HyperscanTokenizer from httpx import ( + ConnectError, + HTTPError, HTTPStatusError, NetworkError, ReadError, + ReadTimeout, RemoteProtocolError, + RequestError, + Response, TimeoutException, ) from juriscraper.lib.exceptions import PacerLoginException, ParsingException @@ -64,13 +72,23 
@@ from juriscraper.state.texas import ( TexasCaseEvent, TexasCaseParty, + TexasCourtOfCriminalAppealsDocket, + TexasCourtOfCriminalAppealsScraper, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, + TexasSupremeCourtDocket, + TexasSupremeCourtScraper, ) from juriscraper.state.texas.common import ( + CourtID, + CourtType, TexasAppellateBrief, TexasCaseDocument, ) +from juriscraper.state.texas.court_of_appeals import ( + TexasCourtOfAppealsDocket, + TexasCourtOfAppealsScraper, +) from openai import ( APIConnectionError, APIError, @@ -80,17 +98,8 @@ ) from pydantic import ValidationError from redis import ConnectionError as RedisConnectionError -from requests import Response -from requests.exceptions import ( - ConnectionError, - HTTPError, - ReadTimeout, - RequestException, - Timeout, -) from rest_framework.renderers import JSONRenderer from sentry_sdk import capture_exception -from urllib3.exceptions import ReadTimeoutError from cl.alerts.tasks import enqueue_docket_alert, send_alert_and_webhook from cl.audio.models import Audio @@ -101,6 +110,7 @@ from cl.citations.utils import filter_out_non_case_law_citations from cl.corpus_importer.api_serializers import IADocketSerializer from cl.corpus_importer.llm_models import CaseNameExtractionResponse +from cl.corpus_importer.management.utils import TexasDocketMeta from cl.corpus_importer.prompts.system import CASE_NAME_EXTRACT_SYSTEM from cl.corpus_importer.utils import ( DownloadPDFResult, @@ -113,17 +123,23 @@ is_long_appellate_document_number, make_iquery_probing_key, mark_ia_upload_needed, + texas_js_court_id_to_court_id, + texas_originating_court_to_court_id, ) from cl.custom_filters.templatetags.text_filters import best_case_name from cl.lib.celery_utils import throttle_task +from cl.lib.command_utils import logger from cl.lib.courts import find_court_object_by_name from cl.lib.crypto import sha1 -from cl.lib.decorators import retry +from cl.lib.decorators import retry, time_call from cl.lib.llm import 
call_llm from cl.lib.microservice_utils import ( doc_page_count_service, microservice, ) +from cl.lib.model_helpers import ( + make_texas_docket_number_core, +) from cl.lib.pacer import ( get_blocked_status, get_first_missing_de_date, @@ -146,6 +162,10 @@ from cl.lib.redis_utils import delete_redis_semaphore, get_redis_interface from cl.lib.storage import AWSMediaStorage from cl.lib.types import TaskData +from cl.people_db.lookup_utils import ( + lookup_judge_by_full_name, + lookup_judge_by_full_name_and_set_attr, +) from cl.people_db.models import Attorney, Role from cl.recap.constants import CR_2017, CR_OLD, CV_2017, CV_2020, CV_OLD from cl.recap.mergers import ( @@ -173,6 +193,7 @@ from cl.search.cluster_sources import ClusterSources from cl.search.models import ( PRECEDENTIAL_STATUS, + CaseTransfer, ClaimHistory, Court, Docket, @@ -185,6 +206,7 @@ ScotusDocketMetadata, SCOTUSDocument, Tag, + TrialCourtData, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument @@ -337,9 +359,7 @@ def upload_recap_json(self, pk: int, database: str = "default") -> None: increment_failure_count(d) -@app.task(bind=True, max_retries=5) -def download_recap_item( - self, +async def download_recap_item( url: str, filename: str, clobber: bool = False, @@ -349,33 +369,27 @@ def download_recap_item( try: if os.path.isfile(location) and not clobber: raise OSError(f" IOError: File already exists at {location}") - r = requests.get( - url, - stream=True, - timeout=60, - headers={"User-Agent": "Free Law Project"}, - ) + async with httpx.AsyncClient() as client: + r = await client.get( + url, + timeout=60, + headers={"User-Agent": "Free Law Project"}, + ) r.raise_for_status() - except requests.Timeout as e: - logger.warning(" Timed out attempting to get: %s\n", url) - raise self.retry(exc=e, countdown=2) - except requests.RequestException as e: - logger.warning(" Unable to get %s\nException was:\n%s", url, e) + except TimeoutException as e: + logger.warning(f" Timed out 
attempting to get: {url}\n") + except RequestError as e: + logger.warning(f" Unable to get {url}\nException was:\n{e}") except OSError as e: - logger.warning(" %s", e) + logger.warning(f" {e}") else: with NamedTemporaryFile(prefix="recap_download_") as tmp: r.raw.decode_content = True - try: - shutil.copyfileobj(r.raw, tmp) - tmp.flush() - except ReadTimeoutError as exc: - # The download failed part way through. - raise self.retry(exc=exc) - else: - # Successful download. Copy from tmp to the right spot. Note - # that this will clobber. - shutil.copyfile(tmp.name, location) + shutil.copyfileobj(r.raw, tmp) + tmp.flush() + # Successful download. Copy from tmp to the right spot. Note + # that this will clobber. + shutil.copyfile(tmp.name, location) @app.task( @@ -396,7 +410,7 @@ def get_and_save_free_document_report( :param log_id: a PACERFreeDocumentLog object id :return: The status code of the scrape """ - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -410,11 +424,11 @@ def get_and_save_free_document_report( report = FreeOpinionReport(court_id, s) msg = "" try: - report.query(start, end, sort="case_number") + async_to_sync(report.query)(start, end, sort="case_number") except ( TypeError, - RequestException, - ReadTimeoutError, + RequestError, + ReadTimeout, PacerLoginException, ParsingException, SoftTimeLimitExceeded, @@ -425,7 +439,7 @@ def get_and_save_free_document_report( "TypeError getting free document report results, likely due " "to failure to get Nonce." 
) - elif isinstance(exc, (RequestException | ReadTimeoutError)): + elif isinstance(exc, (RequestError | ReadTimeout)): msg = "Unable to get free document report results" elif isinstance(exc, PacerLoginException): msg = "PacerLoginException while getting free docs" @@ -699,13 +713,13 @@ def get_and_process_free_pdf( return None raise self.retry() - cookies_data = get_or_cache_pacer_cookies( + cookies_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) try: - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, result.pacer_case_id, result.pacer_doc_id, @@ -747,14 +761,14 @@ def get_and_process_free_pdf( msg = "PacerLoginException while getting free docs." logger.info(f"{msg} Retrying.") # noqa: G004 # Refresh cookies before retrying - get_or_cache_pacer_cookies( + async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, refresh=True, ) raise self.retry(exc=exc) - except (ReadTimeoutError, requests.RequestException) as exc: + except (ReadTimeout, RequestError) as exc: msg = "Request exception getting free PDF" if self.request.retries == self.max_retries: logger.warning(msg) @@ -767,8 +781,7 @@ def get_and_process_free_pdf( if r: pdf_bytes = r.content attachment_number = 0 # Always zero for free opinions - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd.pk, pdf_bytes, r_msg, @@ -781,6 +794,7 @@ def get_and_process_free_pdf( if success is False: PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg) + self.request.chain = None return None rd.refresh_from_db() @@ -854,7 +868,7 @@ def upload_to_ia( source_url: str, media_type: str, description: str, -) -> list[Response] | None: +) -> list[requests.Response] | None: """Upload an item and its files to the Internet Archive On the Internet Archive there are Items 
and files. Items have a global @@ -1081,7 +1095,7 @@ def get_pacer_case_id_and_title( ) if not session_data and user_pk: - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk) if not session_data: raise Exception("Cookies not available in cache") else: @@ -1095,9 +1109,9 @@ def get_pacer_case_id_and_title( report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s) msg = "" try: - report.query(docket_number) - except (RequestException, ReadTimeoutError, PacerLoginException) as exc: - if isinstance(exc, (RequestException | ReadTimeoutError)): + async_to_sync(report.query)(docket_number) + except (RequestError, ReadTimeout, PacerLoginException) as exc: + if isinstance(exc, (RequestError | ReadTimeout)): msg = ( "Network error while running possible case number query on: " "%s.%s" @@ -1130,7 +1144,7 @@ def get_pacer_case_id_and_title( @app.task( bind=True, - autoretry_for=(PacerLoginException, RequestException), + autoretry_for=(PacerLoginException, RequestError), max_retries=5, interval_start=5 * 60, interval_step=10 * 60, @@ -1174,7 +1188,7 @@ def do_case_query_by_pacer_case_id( except Docket.MultipleObjectsReturned: d = None - report.query(pacer_case_id) + async_to_sync(report.query)(pacer_case_id) docket_data = report.data logger.info( "Querying and parsing complete for %s.%s", court_id, pacer_case_id @@ -1262,7 +1276,7 @@ def filter_docket_by_tags( return data -def query_case_query_report( +async def query_case_query_report( court_id: str, pacer_case_id: int ) -> tuple[dict[str, Any], str]: """Query the iquery page for a given PACER case ID. @@ -1272,7 +1286,7 @@ def query_case_query_report( :return: A two tuple, the report data and the report HTML text. 
""" - session_data = get_or_cache_pacer_cookies( + session_data = await get_or_cache_pacer_cookies( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -1284,7 +1298,7 @@ def query_case_query_report( proxy=session_data.proxy_address, ) report = CaseQuery(map_cl_to_pacer_id(court_id), s) - report.query(pacer_case_id) + await report.query(pacer_case_id) return report.data, report.response.text @@ -1314,10 +1328,10 @@ def make_docket_by_iquery_base( """ try: - report_data, report_text = query_case_query_report( + report_data, report_text = async_to_sync(query_case_query_report)( court_id, pacer_case_id ) - except (requests.Timeout, requests.RequestException) as exc: + except (TimeoutException, RequestError) as exc: logger.warning( "Timeout or unknown RequestException on iquery crawl. " "Trying again if retries not exceeded." @@ -1457,7 +1471,7 @@ def make_docket_by_iquery_sweep( ) -@retry((requests.Timeout, PacerLoginException), tries=3, delay=0.25, backoff=1) +@retry((TimeoutException, PacerLoginException), tries=3, delay=0.25, backoff=1) def query_iquery_page( court_id: str, pacer_case_id: int ) -> tuple[bool, None] | tuple[dict[str, Any], str]: @@ -1470,7 +1484,9 @@ def query_iquery_page( and the report HTML text. """ - report_data, report_text = query_case_query_report(court_id, pacer_case_id) + report_data, report_text = async_to_sync(query_case_query_report)( + court_id, pacer_case_id + ) if not report_data: logger.info( "No valid data found in iquery page for %s.%s", @@ -1555,6 +1571,13 @@ def probe_or_scrape_iquery_pages( report_data, report_text = query_iquery_page( court_id, pacer_case_id_to_lookup ) + except TimeoutException: + logger.warning( + "The court %s website is probably down. Aborting the probe task.", + court_id, + ) + break + except HTTPError: # Set expiration accordingly and value to 2 to difference from # other waiting times. 
@@ -1599,13 +1622,6 @@ def probe_or_scrape_iquery_pages( delete_redis_semaphore("CACHE", make_iquery_probing_key(court_id)) return None - except requests.Timeout: - logger.warning( - "The court %s website is probably down. Aborting the probe task.", - court_id, - ) - break - if report_data: # Find and update/store the Docket. reports_data.append( @@ -1793,8 +1809,8 @@ def get_docket_by_pacer_case_id( ) report = DocketReport(map_cl_to_pacer_id(court_id), s) try: - report.query(pacer_case_id, **kwargs) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(pacer_case_id, **kwargs) + except (RequestError, ReadTimeout) as exc: msg = "Network error getting docket: %s" if self.request.retries == self.max_retries: logger.error(f"{msg} Aborting chain.", logging_id) # noqa: G004 @@ -1873,8 +1889,8 @@ def get_appellate_docket_by_docket_number( logger.info("Querying docket report %s", logging_id) try: - report.query(docket_number, **kwargs) - except requests.RequestException as e: + async_to_sync(report.query)(docket_number, **kwargs) + except RequestError as e: logger.warning("Problem getting docket %s", logging_id) if self.request.retries == self.max_retries: self.request.chain = None @@ -1922,7 +1938,7 @@ def get_appellate_docket_by_docket_number( } -def get_att_report_by_rd( +async def get_att_report_by_rd( rd: RECAPDocument, session_data: SessionData, ) -> AttachmentPage | None: @@ -1939,9 +1955,11 @@ def get_att_report_by_rd( s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) - pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) - is_appellate_case = is_appellate_court(pacer_court_id) - is_acms_document = rd.is_acms_document() + de = await DocketEntry.objects.aget(id=rd.docket_entry_id) + d = await Docket.objects.aget(id=de.docket_id) + pacer_court_id = map_cl_to_pacer_id(d.court_id) + is_appellate_case = await is_appellate_court(pacer_court_id) + is_acms_document = await 
sync_to_async(rd.is_acms_document)() if is_acms_document: report_class = ACMSAttachmentPage @@ -1953,11 +1971,11 @@ def get_att_report_by_rd( att_report = report_class(pacer_court_id, s) if is_acms_document: - docket_case_id = rd.docket_entry.docket.pacer_case_id + docket_case_id = d.pacer_case_id rd_entry_id = rd.pacer_doc_id - att_report.query(docket_case_id, rd_entry_id) + await att_report.query(docket_case_id, rd_entry_id) else: - att_report.query(rd.pacer_doc_id) + await att_report.query(rd.pacer_doc_id) return att_report @@ -1988,7 +2006,7 @@ def get_attachment_page_by_rd( self.request.chain = None return None try: - att_report = get_att_report_by_rd(rd, session_data) + att_report = async_to_sync(get_att_report_by_rd)(rd, session_data) except HTTPError as exc: if exc.response and exc.response.status_code in [ HTTPStatus.INTERNAL_SERVER_ERROR, @@ -2008,7 +2026,7 @@ def get_attachment_page_by_rd( logger.error(msg, str(exc)) self.request.chain = None return None - except requests.RequestException as exc: + except RequestError as exc: logger.warning("Unable to get attachment page for %s", rd) raise self.retry(exc=exc) return att_report @@ -2056,8 +2074,8 @@ def get_bankr_claims_registry( logger.info("Querying claims information for %s", logging_id) report = ClaimsRegister(map_cl_to_pacer_id(d.court_id), s) try: - report.query(d.pacer_case_id, d.docket_number_raw) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(d.pacer_case_id, d.docket_number) + except (RequestError, ReadTimeout) as exc: if self.request.retries == self.max_retries: self.request.chain = None logger.error( @@ -2180,7 +2198,7 @@ def save_attachment_pq_from_text( return pq.pk -def download_acms_pdf_by_rd( +async def download_acms_pdf_by_rd( court_id: str, acms_entry_id: str, acms_doc_id: str, @@ -2202,11 +2220,11 @@ def download_acms_pdf_by_rd( cookies=session_data.cookies, proxy=session_data.proxy_address ) report = ACMSDocketReport(pacer_court_id, s) - r, r_msg = 
report.download_pdf(acms_entry_id, acms_doc_id) + r, r_msg = await report.download_pdf(acms_entry_id, acms_doc_id) return r, r_msg -def download_pacer_pdf_by_rd( +async def download_pacer_pdf_by_rd( rd_pk: int, pacer_case_id: str, pacer_doc_id: str, @@ -2224,34 +2242,36 @@ def download_pacer_pdf_by_rd( and proxy. :param magic_number: The magic number to fetch PACER documents for free this is an optional field, only used by RECAP Email documents - :return: A two-tuple of requests.Response object usually containing a PDF, + :return: A two-tuple of httpx.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ - rd = RECAPDocument.objects.get(pk=rd_pk) - pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) + rd = await RECAPDocument.objects.aget(pk=rd_pk) + de = await DocketEntry.objects.aget(id=rd.docket_entry_id) + d = await Docket.objects.aget(id=de.docket_id) + pacer_court_id = map_cl_to_pacer_id(d.court_id) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) - if is_appellate_court(pacer_court_id): + if await is_appellate_court(pacer_court_id): report = AppellateDocketReport(pacer_court_id, s) pacer_doc_id = ( pacer_doc_id if not rd.attachment_number else f"{pacer_doc_id[:3]}1{pacer_doc_id[4:]}" ) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_doc_id=pacer_doc_id, pacer_case_id=pacer_case_id ) else: report = FreeOpinionReport(pacer_court_id, s) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_case_id, pacer_doc_id, magic_number, de_seq_num=de_seq_num ) return r, r_msg -def download_pdf_by_magic_number( +async def download_pdf_by_magic_number( court_id: str, pacer_doc_id: str, pacer_case_id: str, @@ -2273,7 +2293,7 @@ def download_pdf_by_magic_number( :param de_seq_num: The sequential number assigned by the PACER system to identify the docket entry within a case. 
:param acms: Whether the download belongs to an ACMS notification. - :return: A two-tuple of requests.Response object usually containing a PDF, + :return: A two-tuple of httpx.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ @@ -2281,13 +2301,13 @@ def download_pdf_by_magic_number( cookies=session_data.cookies, proxy=session_data.proxy_address ) report = FreeOpinionReport(court_id, s) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_case_id, pacer_doc_id, magic_number, appellate, de_seq_num, acms ) return r, r_msg -def get_document_number_from_confirmation_page( +async def get_document_number_from_confirmation_page( court_id: str, pacer_doc_id: str ) -> str: """Get the PACER document number from the PACER download confirmation page. @@ -2297,20 +2317,20 @@ def get_document_number_from_confirmation_page( :return: The PACER document number is available or an empty string if not. """ - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) doc_num_report = DownloadConfirmationPage(court_id, s) - doc_num_report.query(pacer_doc_id) + await doc_num_report.query(pacer_doc_id) data = doc_num_report.data return data.get("document_number", "") -def get_document_number_for_appellate( +async def get_document_number_for_appellate( court_id: str, pacer_doc_id: str, pq: ProcessingQueue, @@ -2335,7 +2355,7 @@ def get_document_number_for_appellate( pdf_bytes = local_path.read() if pdf_bytes: # For other jurisdictions try first to get it from the PDF document. 
- dn_response = async_to_sync(microservice)( + dn_response = await microservice( service="document-number", file_type="pdf", file=pdf_bytes, @@ -2346,7 +2366,7 @@ def get_document_number_for_appellate( if not document_number and pacer_doc_id and not acms: # If we still don't have the document number fall back on the # download confirmation page - document_number = get_document_number_from_confirmation_page( + document_number = await get_document_number_from_confirmation_page( court_id, pacer_doc_id ) @@ -2367,7 +2387,7 @@ def get_document_number_for_appellate( return document_number -def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: +async def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: """Check if a pacer doc is sealed, querying the document in PACER. If a receipt is returned the document is not sealed, otherwise is sealed. @@ -2376,22 +2396,22 @@ def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: :return: True if the document is sealed on PACER, False otherwise. 
""" - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) receipt_report = DownloadConfirmationPage(court_id, s) - receipt_report.query(pacer_doc_id) + await receipt_report.query(pacer_doc_id) data = receipt_report.data if data == {}: return True return False -def is_docket_entry_sealed( +async def is_docket_entry_sealed( court_id: str, case_id: str, doc_id: str | None ) -> bool: """Check if a docket entry is sealed, querying the download confirmation @@ -2408,8 +2428,8 @@ def is_docket_entry_sealed( if not doc_id: return False - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) @@ -2420,8 +2440,7 @@ def is_docket_entry_sealed( return report.is_entry_sealed(case_id, doc_id) -def update_rd_metadata( - self: Task, +async def update_rd_metadata( rd_pk: int, pdf_bytes: bytes | None, r_msg: str, @@ -2434,7 +2453,6 @@ def update_rd_metadata( ) -> tuple[bool, str]: """After querying PACER and downloading a document, save it to the DB. - :param self: The celery task :param rd_pk: The primary key of the RECAPDocument to work on :param pdf_bytes: The byte array of the PDF. :param r_msg: A message from the download function about an error that was @@ -2450,8 +2468,8 @@ def update_rd_metadata( error/success message string. 
""" - rd = RECAPDocument.objects.get(pk=rd_pk) - if pdf_bytes is None: + rd = await RECAPDocument.objects.aget(pk=rd_pk) + if not pdf_bytes: if r_msg and "An attachment page was returned instead" in r_msg: msg = ( "This PACER document is part of an attachment page. " @@ -2465,14 +2483,13 @@ def update_rd_metadata( f"Unable to get PDF for RECAP Document '{rd_pk}' " f"at '{court_id}' with doc id '{pacer_doc_id}'" ) - self.request.chain = None return False, msg file_name = get_document_filename( court_id, pacer_case_id, document_number, attachment_number ) cf = ContentFile(pdf_bytes) - rd.filepath_local.save(file_name, cf, save=False) + await sync_to_async(rd.filepath_local.save)(file_name, cf, save=False) rd.file_size = rd.filepath_local.size rd.is_available = True # We've got the PDF. rd.date_upload = rd.date_upload or now() @@ -2481,7 +2498,7 @@ def update_rd_metadata( # request.content is sometimes a str, sometimes unicode, so # force it all to be bytes, pleasing hashlib. rd.sha1 = sha1(pdf_bytes) - response = async_to_sync(doc_page_count_service)(rd) + response = await doc_page_count_service(rd) if response.is_success: rd.page_count = int(response.text) assert isinstance(rd.page_count, (int | type(None))), ( @@ -2489,12 +2506,10 @@ def update_rd_metadata( ) # Save and extract, skipping OCR. 
- rd.save() + await rd.asave() # Make sure we mark the docket as needing upload - async_to_sync(mark_ia_upload_needed)( - rd.docket_entry.docket, save_docket=True - ) + await mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True) return True, "Saved item successfully" @@ -2516,7 +2531,7 @@ def add_tags(rd: RECAPDocument, tag_name: str | None) -> None: @app.task( bind=True, - autoretry_for=(PacerLoginException, RequestException, HTTPError), + autoretry_for=(PacerLoginException, RequestError, HTTPError), max_retries=3, interval_start=5, interval_step=5, @@ -2547,7 +2562,7 @@ def get_pacer_doc_by_rd( pacer_case_id = rd.docket_entry.docket.pacer_case_id de_seq_num = rd.docket_entry.pacer_sequence_number - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, rd.pacer_doc_id, @@ -2559,8 +2574,7 @@ def get_pacer_doc_by_rd( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2580,7 +2594,7 @@ def get_pacer_doc_by_rd( @app.task( bind=True, - autoretry_for=(ConnectionError, ReadTimeout, HTTPError, RequestException), + autoretry_for=(ConnectError, ReadTimeout, HTTPError, RequestError), max_retries=15, interval_start=5, interval_step=5, @@ -2662,7 +2676,7 @@ def get_pacer_doc_by_rd_and_description( pacer_case_id = rd.docket_entry.docket.pacer_case_id de_seq_num = rd.docket_entry.pacer_sequence_number - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, att_found["pacer_doc_id"], @@ -2674,8 +2688,7 @@ def get_pacer_doc_by_rd_and_description( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2687,6 +2700,7 @@ def get_pacer_doc_by_rd_and_description( ) if success is False: + self.request.chain = None return # Skip 
OCR for now. It'll happen in a second step. @@ -2723,12 +2737,12 @@ def get_pacer_doc_id_with_show_case_doc_url( last_try = self.request.retries == self.max_retries try: if rd.document_type == rd.ATTACHMENT: - report.query( + async_to_sync(report.query)( d.pacer_case_id, rd.document_number, rd.attachment_number ) else: - report.query(d.pacer_case_id, rd.document_number) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(d.pacer_case_id, rd.document_number) + except (RequestError, ReadTimeout) as exc: msg = "Unable to get PDF for %s" if last_try: logger.error(msg, rd) @@ -2849,7 +2863,7 @@ def query_and_save_list_of_creditors( if not os.path.exists(html_file): try: report_hidden_api = PossibleCaseNumberApi(court_id, s) - report_hidden_api.query(docket_number) + async_to_sync(report_hidden_api.query)(docket_number) result = report_hidden_api.data( office_number=row["OFFICE"], docket_number_letters="bk", @@ -2903,7 +2917,7 @@ def query_and_save_list_of_creditors( # First get the POST param to ensure the same cost as in the browser. try: - post_param = report.query_post_param() + post_param = async_to_sync(report.query_post_param)() except IndexError as exc: # Sometimes this query fails, retry if there are retries available. 
if self.request.retries == self.max_retries: @@ -2931,7 +2945,7 @@ def query_and_save_list_of_creditors( logger.info("Invalid POST param for %s, aborting...", court_id) return None - report.query( + async_to_sync(report.query)( pacer_case_id=pacer_case_id, docket_number=docket_number, post_param=post_param, @@ -3250,7 +3264,7 @@ def download_pdf_in_stream( """ @retry( - (ConnectionError, Timeout), + (ConnectionError, requests.Timeout), tries=3, delay=0.25, backoff=1, @@ -3849,6 +3863,7 @@ def ingest_scotus_docket(docket_data: dict[str, Any]) -> None: ignore_result=True, # No retries because download_pdf_in_stream already has retry logic ) +@throttle_task("2/s") def download_texas_document_pdf( self: Task, texas_document_pk: int ) -> int | None: @@ -3903,9 +3918,224 @@ def download_texas_document_pdf( return texas_document_pk +class MergeResult[T = int](NamedTuple): + """Stores data about the result of an attempted merge operation.""" + + create: bool + """Whether a document needed to be created.""" + update: bool + """Whether a document needed to be updated.""" + success: bool + """Whether the operation was successful.""" + pk: T | None + """The primary key of the created or updated object.""" + + @staticmethod + def created[S](pk: S) -> MergeResult[S]: + """Shorthand for the result of a successful creation operation. + + :param pk: The primary key of the created object. + :return: The constructed MergeResult object.""" + return MergeResult(create=True, update=False, success=True, pk=pk) + + @staticmethod + def updated[S](pk: S) -> MergeResult[S]: + """Shorthand for the result of a successful update operation. + + :param pk: The primary key of the updated object. + :return: The constructed MergeResult object.""" + return MergeResult(create=False, update=True, success=True, pk=pk) + + @staticmethod + def failed[S]() -> MergeResult[S]: + """Shorthand for the result of a failed merge operation. 
+ + :return: The constructed MergeResult object.""" + return MergeResult[S]( + create=False, update=False, success=False, pk=None + ) + + @staticmethod + def unnecessary[S](pk: S | None) -> MergeResult[S]: + """Shorthand for the result of an unnecessary merge operation. + + :return: The constructed MergeResult object.""" + return MergeResult[S](create=False, update=False, success=True, pk=pk) + + +def merge_texas_trial_court_data( + docket: Docket, + docket_data: TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, +) -> MergeResult: + """ + Create or update a TrialCourtData object to capture trial court information + for Texas SC and CCA cases. + + :param docket: The docket in the SC or CCA. + :param docket_data: The scraped docket data. + + :return: The result of the attempted merge operation. + """ + originating_court = docket_data["originating_court"] + if originating_court["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Originating court for Texas docket %s is appellate. 
TrialCourtData unnecessary.", + docket.docket_number, + ) + return MergeResult.unnecessary(None) + dn_trial = originating_court["case"] + judge_name = originating_court["judge"] + reporter = originating_court["reporter"] + punishment = originating_court["punishment"] + county = originating_court["county"] + court_id = texas_originating_court_to_court_id(originating_court) + court = None + court_name = originating_court["name"] + judge = None + if court_id: + try: + court = Court.objects.get(pk=court_id) + except Court.DoesNotExist: + logger.error("Court with ID %s not found.", court_id) + court = None + else: + court_name = court.full_name + if judge_name: + judge = async_to_sync(lookup_judge_by_full_name)( + name=judge_name, + court_id=court_id, + event_date=None, + require_living_judge=False, + ) + + try: + trial_court_data = TrialCourtData.objects.get( + docket=docket, + ) + except TrialCourtData.DoesNotExist: + logger.info( + "No existing TrialCourtData object found for Texas docket %s. 
Creating...", + docket.docket_number, + ) + created = True + trial_court_data = TrialCourtData( + docket=docket, + ) + else: + created = False + + new_values = { + "docket_number_trial": dn_trial, + "docket_number_raw_trial": dn_trial, + "judge_str": judge_name, + "judge": judge, + "reporter": reporter, + "court_name": court_name, + "court": court, + "punishment": punishment, + "county": county, + } + + updated = False + if not created: + updated = any( + getattr(trial_court_data, k) != v for k, v in new_values.items() + ) + if not updated: + return MergeResult.unnecessary(trial_court_data.pk) + + for k, v in new_values.items(): + setattr(trial_court_data, k, v) + trial_court_data.save() + return MergeResult( + create=created, update=updated, success=True, pk=trial_court_data.pk + ) + + +def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: + """Merges a CaseTransfer object in the database by first checking if it can + be used to update an existing object, and if not, creating a new object (if + necessary). + + :param case_transfer: The CaseTransfer object to be merged. + :return: The result of this merge attempt.""" + logger.info( + "Merging CaseTransfer from docket %s in court %s to docket %s in court %s on %s with type %s.", + case_transfer.origin_docket_number, + case_transfer.origin_court.pk, + case_transfer.destination_docket_number, + case_transfer.destination_court.pk, + case_transfer.transfer_date.isoformat(), + case_transfer.transfer_type, + ) + candidate_case_transfers = CaseTransfer.objects.filter( + origin_court=case_transfer.origin_court, + origin_docket_number=case_transfer.origin_docket_number, + destination_court=case_transfer.destination_court, + destination_docket_number=case_transfer.destination_docket_number, + transfer_date=case_transfer.transfer_date, + transfer_type=case_transfer.transfer_type, + ) + try: + # Try to find an existing CaseTransfer to fill in info for. 
+ if case_transfer.origin_docket: + existing_case_transfer = candidate_case_transfers.get( + origin_docket=None + ) + else: + existing_case_transfer = candidate_case_transfers.get( + destination_docket=None + ) + except CaseTransfer.MultipleObjectsReturned: + # This should never happen + logger.error( + "Found multiple matching CaseTransfer objects.", + ) + return MergeResult.failed() + except CaseTransfer.DoesNotExist: + logger.info( + "Could not find existing transfer to update. Checking if transfer already exists..." + ) + try: + existing_case_transfer = candidate_case_transfers.get( + origin_docket=case_transfer.origin_docket, + destination_docket=case_transfer.destination_docket, + ) + except CaseTransfer.MultipleObjectsReturned: + # Should never happen + logger.error( + "Found multiple matching CaseTransfer objects.", + ) + return MergeResult.failed() + except CaseTransfer.DoesNotExist: + logger.info( + "Did not find existing CaseTransfer object. Creating..." + ) + case_transfer.save() + return MergeResult.created(case_transfer.pk) + logger.info( + "Identical CaseTransfer object already exists. Merge is unnecessary." + ) + return MergeResult.unnecessary(existing_case_transfer.pk) + else: + logger.info( + "Updating existing CaseTransfer %s.", existing_case_transfer.pk + ) + if case_transfer.origin_docket: + existing_case_transfer.origin_docket = case_transfer.origin_docket + else: + existing_case_transfer.destination_docket = ( + case_transfer.destination_docket + ) + existing_case_transfer.save() + return MergeResult.updated(existing_case_transfer.pk) + + def merge_texas_document( - docket_entry: TexasDocketEntry, input_document: TexasCaseDocument -) -> tuple[bool, bool, int]: + docket_entry: TexasDocketEntry, + input_document: TexasCaseDocument, + download_attachments: bool = True, +) -> MergeResult: """Merge a single TexasCaseDocument object into CL. 
Checks if the document exists, creating a TexasDocument object if it does @@ -3915,161 +4145,152 @@ def merge_texas_document( :param docket_entry: The docket entry this attachment belongs to. :param input_document: The attachment to merge. - :return: Tuple with entries - - Flag indicating whether a document needed to be created or updated - - Flag indicating whether the update operation was successful or not - applicable - - Primary key of the TexasDocument object which matches the input document + :param download_attachments: Whether to download docket entry attachments. + + :return: The result of the merge operation. """ - (texas_document, created) = TexasDocument.objects.get_or_create( - media_id=input_document["media_id"], - docket_entry=docket_entry, - defaults={ - "description": input_document["description"], - "media_version_id": input_document["media_version_id"], - "url": input_document["document_url"], - }, - ) + try: + texas_document = TexasDocument.objects.get( + media_id=input_document["media_id"], + docket_entry=docket_entry, + ) + except TexasDocument.DoesNotExist: + existed = False + needs_update = True + texas_document = TexasDocument( + media_id=input_document["media_id"], + docket_entry=docket_entry, + ) + else: + existed = True + needs_update = ( + str(texas_document.media_version_id) + != input_document["media_version_id"] + or not texas_document.filepath_local + ) - if ( - created - or str(texas_document.media_version_id) - != input_document["media_version_id"] - or not texas_document.filepath_local - ): + if needs_update: texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.url = input_document["document_url"] + if texas_document.filepath_local: + texas_document.filepath_local.delete(save=False) + texas_document.filepath_local = "" + texas_document.ocr_status = None texas_document.save() - chain( - download_texas_document_pdf.si(texas_document.pk), - 
extract_pdf_document.s( - check_if_needed=False, model_name="search.TexasDocument" - ), - ).apply_async() - return True, True, texas_document.pk - - return False, True, texas_document.pk - - -def merge_texas_documents( - docket_entry: TexasDocketEntry, - documents: list[TexasCaseDocument], -) -> list[tuple[bool, bool, int]]: - """Merges a list of Texas docket entry attachments into CL. - - :param docket_entry: The docket entry this attachment belongs to. - :param documents: List of TexasCaseDocument attached to this docket entry. - :return: List of tuples with the following entries: - - A flag indicating whether the document needed to be created or updated, - - A flag indicating which is set to True when the document was successfully - created or updated or when an update was unnecessary, - - The primary key of the updated TexasDocument object.""" - output = [ - merge_texas_document(docket_entry, document) for document in documents - ] + if download_attachments: + transaction.on_commit( + # Lambda captures the pk without needing to keep the whole + # object around. It needs to be wrapped in another lambda to + # prevent mypy from complaining. 
+ ( + lambda pk: lambda: chain( + download_texas_document_pdf.si(pk), + extract_pdf_document.s( + check_if_needed=False, + model_name="search.TexasDocument", + ), + ).apply_async() + )(texas_document.pk) + ) + return MergeResult( + create=not existed, + update=existed, + success=True, + pk=texas_document.pk, + ) - return output + return MergeResult.unnecessary(texas_document.pk) @transaction.atomic def merge_texas_docket_entry( docket: Docket, sequence_number: str, - appellate_brief: bool, - input_docket_entry: TexasCaseEvent - | TexasAppellateBrief - | TexasSupremeCourtCaseEvent - | TexasSupremeCourtAppellateBrief, -) -> tuple[bool, bool, int]: + case_event: TexasCaseEvent | TexasSupremeCourtCaseEvent, + appellate_brief: TexasAppellateBrief + | TexasSupremeCourtAppellateBrief + | None = None, + download_attachments: bool = True, +) -> MergeResult: """Merges a Texas docket entry into CL. :param docket: The docket this entry belongs to. :param sequence_number: The sequence number of the docket entry. - :param appellate_brief: Whether the docket entry is an appellate brief. - :param input_docket_entry: The docket entry being merged. + :param case_event: The docket entry information being merged. + :param appellate_brief: Appellate brief information if the docket entry is + an appellate brief, None otherwise. + :param download_attachments: Whether to download docket entry attachments. + :return: Tuple with the following entries - A flag indicating whether the docket entry or an attached document needed to be created or updated, - A flag which is set to true when the create/update operations are all either successful or unnecessary, - - The primary key of the updated TexasDocketEntry object.""" + - The primary key of the updated TexasDocketEntry object. 
+ """ logger.info( "Merging TexasDocketEntry with sequence number %s into Docket %s", sequence_number, docket.pk, ) + appellate_brief_flag = bool(appellate_brief) Docket.objects.select_for_update().get(pk=docket.pk) docket_entries = TexasDocketEntry.objects.filter( docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, + date_filed=case_event["date"], + entry_type=case_event["type"], + appellate_brief=appellate_brief_flag, ) + docket_entry = None try: docket_entry = docket_entries.get() except TexasDocketEntry.DoesNotExist: - logger.info( - "No existing TexasDocketEntry found for sequence number %s on Docket %s. Creating new entry.", - sequence_number, - docket.pk, - ) - docket_entry = TexasDocketEntry( - docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, - ) - created = True + pass except TexasDocketEntry.MultipleObjectsReturned: # More filtering needed - matching_sequence_number = docket_entries.filter( + matching_sequence_numbers = docket_entries.filter( sequence_number=sequence_number - ).first() - logger.info( - "Multiple matching TexasDocketEntries found for sequence number %s on Docket %s.", - sequence_number, - docket.pk, ) - if matching_sequence_number: - logger.info( - "Found existing TexasDocketEntry for sequence number %s on Docket %s. Updating entry.", + try: + docket_entry = matching_sequence_numbers.get() + except TexasDocketEntry.MultipleObjectsReturned: + logger.error( + "Multiple matching TexasDocketEntries found for sequence number %s on Docket %s.", sequence_number, docket.pk, ) - docket_entry = matching_sequence_number - created = False - else: + docket_entry = matching_sequence_numbers.first() + except TexasDocketEntry.DoesNotExist: logger.error( - "No existing TexasDocketEntry found for sequence number %s on Docket %s. 
Creating new entry.", + "Could not find matching TexasDocketEntry with sequence number %s on Docket %s. Creating new docket entry, which may be a duplicate...", sequence_number, docket.pk, ) - docket_entry = TexasDocketEntry( - docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, - ) - created = True else: logger.info( "Found existing TexasDocketEntry for sequence number %s on Docket %s. Updating entry.", sequence_number, docket.pk, ) - created = False + + created = False + if not docket_entry: + docket_entry = TexasDocketEntry( + docket=docket, + date_filed=case_event["date"], + entry_type=case_event["type"], + appellate_brief=appellate_brief_flag, + ) + created = True docket_entry.sequence_number = sequence_number - docket_entry.description = input_docket_entry.get("description", "") - docket_entry.disposition = input_docket_entry.get("disposition", "") - docket_entry.remarks = input_docket_entry.get("remarks", "") - logger.info( - "Saving TexasDocketEntry %s on Docket %s", - docket_entry.pk, - docket.pk, + docket_entry.description = ( + appellate_brief["description"] if appellate_brief else "" ) + docket_entry.disposition = case_event["disposition"] + docket_entry.remarks = case_event.get("remarks", "") docket_entry.save() logger.info( @@ -4077,17 +4298,73 @@ def merge_texas_docket_entry( docket_entry.pk, docket.pk, ) - document_results = merge_texas_documents( - docket_entry, input_docket_entry["attachments"] - ) + document_results = [ + merge_texas_document( + docket_entry, document, download_attachments=download_attachments + ) + for document in case_event["attachments"] + ] - return ( - created or any(r[0] for r in document_results), - all(r[1] for r in document_results), - docket_entry.pk, + return MergeResult( + create=created or any(r.create for r in document_results), + update=not created or any(r.update for r in document_results), + success=all(r.success for r in 
document_results), + pk=docket_entry.pk, ) +def merge_texas_docket_entries( + docket: Docket, + case_events: list[TexasCaseEvent] | list[TexasSupremeCourtCaseEvent], + appellate_briefs: list[TexasAppellateBrief] + | list[TexasSupremeCourtAppellateBrief], + download_attachments: bool = True, +) -> MergeResult: + """ + Merges a list of Texas case events and Texas appellate briefs for a given + docket into CL. + + :param docket: The parent docket. + :param case_events: Scraped case events. + :param appellate_briefs: Scraped appellate briefs. + :param download_attachments: Whether to download attachments. + + :return: The result of the attempted merge operation. + """ + brief_iter = iter(appellate_briefs) + next_brief = next(brief_iter, None) + + create = False + update = False + success = True + for i, (case_event, sequence_number) in enumerate( + zip(case_events, create_docket_entry_sequence_numbers(case_events)) + ): + appellate_brief = None + if ( + next_brief is not None + and case_event["date"] == next_brief["date"] + and case_event["type"] == next_brief["type"] + and case_event["attachments"] == next_brief["attachments"] + ): + appellate_brief = next_brief + next_brief = next(brief_iter, None) + + merge_result = merge_texas_docket_entry( + docket, + sequence_number, + case_event, + appellate_brief, + download_attachments=download_attachments, + ) + + create = merge_result.create or create + update = merge_result.update or update + success = merge_result.success and success + + return MergeResult(create=create, update=update, success=success, pk=None) + + def normalize_texas_parties( parties: list[TexasCaseParty], ) -> list[dict[str, Any]]: @@ -4112,14 +4389,42 @@ def normalize_texas_parties( "contact": attorney, "roles": ["LEAD_ATTORNEY"] if i == 0 else ["UNKNOWN"], } - for i, attorney in enumerate(party["representatives"]) + for i, attorney in enumerate( + [rep for rep in party["representatives"] if len(rep) > 0] + ) ], } for party in parties ] -def 
merge_texas_parties(docket: Docket, parties: list[TexasCaseParty]) -> None: +def texas_docket_has_appellate_info( + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> bool: + """ + Helper method returning whether a scraped Texas docket has appellate case + info. + + Checks that the docket court is not an appellate court (cases in appellate + courts cannot be appealed to appellate courts) and that the "appeals_court" + entry of docket data is filled in. + + :param docket_data: The scraped docket data. + + :return: Whether the docket has appellate case information. + """ + + return ( + docket_data["court_type"] != CourtType.APPELLATE.value + and docket_data["appeals_court"]["court_id"] != CourtID.UNKNOWN.value + ) + + +def merge_texas_parties( + docket: Docket, parties: list[TexasCaseParty] +) -> MergeResult: """Merge Texas case parties and attorneys into the given docket. This function takes a docket and a list of parties associated with a Texas @@ -4129,5 +4434,539 @@ def merge_texas_parties(docket: Docket, parties: list[TexasCaseParty]) -> None: :param docket: The docket to which parties and attorneys should be added. :param parties: The parties involved in the Texas case. + :return: A MergeResult indicating the operation succeeded. Note that + create and update flags are always False and pk is always None since + add_parties_and_attorneys does not return this information. """ add_parties_and_attorneys(docket, normalize_texas_parties(parties)) + return MergeResult(create=False, update=False, success=True, pk=None) + + +def merge_texas_docket_originating_court( + docket: Docket, + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> MergeResult: + """Merge originating court information into the given Texas docket. + + :param docket: The docket to add the originating court to. + :param docket_data: The docket data from Juriscraper. 
+ :return: The result of the merge operation.""" + + if texas_docket_has_appellate_info(docket_data): + ocd = docket_data["appeals_court"] + oc_dn = ocd["case_number"] + oc_reporter = "" + oc_judge = ocd["justice"] + oc_id = texas_js_court_id_to_court_id(ocd["court_id"]) + elif ( + docket_data["originating_court"]["court_type"] + != CourtType.UNKNOWN.value + ): + ocd = docket_data["originating_court"] + oc_dn = ocd["case"] + oc_reporter = ocd["reporter"] + oc_judge = ocd["judge"] + oc_id = texas_originating_court_to_court_id(ocd) + else: + logger.warning( + "Skipping merge of OCI for Texas docket %s due to unknown originating court type.", + docket.docket_number, + ) + return MergeResult.failed() + + created = False + if not docket.originating_court_information: + created = True + docket.originating_court_information = OriginatingCourtInformation() + + oci = docket.originating_court_information + + oci.docket_number = oc_dn + oci.docket_number_raw = oc_dn + oci.court_reporter = oc_reporter + oci.assigned_to_str = oc_judge + # Only update judge if we're able to associate them with a court. + if oc_id: + async_to_sync(lookup_judge_by_full_name_and_set_attr)( + item=oci, + target_field="assigned_to", + full_name=oc_judge, + court_id=oc_id, + event_date=None, + require_living_judge=False, + ) + oci.save() + if created: + docket.save() + + return MergeResult(create=created, update=False, success=True, pk=None) + + +def merge_texas_case_transfers( + docket: Docket, + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> MergeResult: + """This method creates or updates any `CaseTransfer` objects which point to + or originate from a given docket to capture appeal and work sharing + information. + + If a `CaseTransfer` exists with the same origin and destination docket + numbers and court fields as one we want to create, update the origin or + destination docket foreign key field to point to this docket.
This allows + us to merge `CaseTransfer` objects for which we only have partial + information and complete the information at a later time (or never if the + origin/destination is a court we don't scrape). + + :param docket: The docket to add the appeal information to. + :param docket_data: The docket data from Juriscraper. + :return: The result of the CaseTransfer merge operation""" + logger.info( + "Determining transfers for docket %s in court %s...", + docket.docket_number, + docket.court.pk, + ) + + originating_court = docket_data["originating_court"] + oc_type = originating_court["court_type"] + oc_dn = originating_court["case"] + appeals_court = docket_data.get("appeals_court", {}) + ac_id = appeals_court.get("court_id", "") + ac_dn = appeals_court.get("case_number", "") + trial_court_id = texas_originating_court_to_court_id(originating_court) + appeal_transfer_origin_court_id: str | None = "" + appeal_transfer_origin_dn = "" + + transfers = [] + + match docket_data["court_id"]: + # Death penalty cases are automatically appealed to the CCA so the + # appellate court may be missing. 
+ case CourtID.COURT_OF_CRIMINAL_APPEALS.value if ( + ac_id == CourtID.UNKNOWN.value + ): + logger.info( + "Docket %s in the CCA is a death penalty appeal", + docket.docket_number, + ) + + if not trial_court_id: + logger.error( + "Unable to determine trial court ID for Texas docket %s to create death penalty appeal CaseTransfer", + docket.docket_number, + ) + return MergeResult.failed() + + appeal_transfer_origin_dn = oc_dn + appeal_transfer_origin_court_id = trial_court_id + case CourtID.COURT_OF_CRIMINAL_APPEALS.value: + logger.info( + "Docket %s is a non-death penalty CCA docket", + docket.docket_number, + ) + + appeal_transfer_origin_dn = ac_dn + appeal_transfer_origin_court_id = texas_js_court_id_to_court_id( + ac_id + ) + case CourtID.SUPREME_COURT.value if ac_id == CourtID.UNKNOWN.value: + if oc_type == CourtType.UNKNOWN.value: + logger.warning( + "Found Texas SC docket with no originating or appellate information (docket number %s).", + docket.docket_number, + ) + + return MergeResult.failed() + + logger.warning( + "Found Texas SC docket with originating information but no appellate information (docket number %s). 
Falling back to using trial court to create appeal type transfer.", + docket.docket_number, + ) + + appeal_transfer_origin_dn = oc_dn + appeal_transfer_origin_court_id = trial_court_id + case CourtID.SUPREME_COURT.value: + logger.info("Docket %s is a SC docket", docket.docket_number) + appeal_transfer_origin_court_id = texas_js_court_id_to_court_id( + ac_id + ) + appeal_transfer_origin_dn = ac_dn + case _ if docket_data["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Docket %s is an appellate docket", docket.docket_number + ) + + appeal_transfer_origin_court_id = trial_court_id + appeal_transfer_origin_dn = oc_dn + + transfer_from = docket_data.get("transfer_from") + if transfer_from: + logger.info( + "Appellate docket %s has an incoming transfer", + docket.docket_number, + ) + + coa_transfer_date = transfer_from["date"] + # If the transfer date is absent or empty, assume it matches the filing date + if not coa_transfer_date: + logger.warning( + "Missing transfer date for transfer of docket %s. 
Defaulting to filing date.", + docket.docket_number, + ) + coa_transfer_date = docket_data["date_filed"] + + coa_transfer_origin_court_id = texas_js_court_id_to_court_id( + transfer_from["court_id"] + ) + + try: + coa_transfer_origin_court = Court.objects.get( + pk=coa_transfer_origin_court_id + ) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + coa_transfer_origin_court_id, + ) + else: + # Texas Government Code 73.001 (accessed 2026-02-23) + coa_transfer_type = ( + CaseTransfer.JURISDICTION + if docket_data["court_id"] + == CourtID.FIFTEENTH_COURT_OF_APPEALS.value + else CaseTransfer.WORKLOAD + ) + transfers.append( + CaseTransfer( + origin_court=coa_transfer_origin_court, + origin_docket_number=transfer_from[ + "origin_docket" + ], + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + transfer_date=coa_transfer_date, + transfer_type=coa_transfer_type, + ) + ) + case _: + logger.error( + "Unrecognized Texas court ID %s and type %s while creating CaseTransfer", + docket_data["court_id"], + docket_data["court_type"], + ) + + return MergeResult.failed() + + if appeal_transfer_origin_court_id: + try: + appeal_origin_court = Court.objects.get( + pk=appeal_transfer_origin_court_id + ) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court with appeal type.", + appeal_transfer_origin_court_id, + ) + else: + transfers.append( + CaseTransfer( + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + origin_court=appeal_origin_court, + origin_docket_number=appeal_transfer_origin_dn, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, + ) + ) + + results = [merge_case_transfer(transfer) for transfer in transfers] + + return MergeResult( + success=all([r.success for r in results]), + 
create=any([r.create for r in results]), + update=any([r.update for r in results]), + pk=None, + ) + + +def merge_texas_docket( + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, + download_attachments: bool = True, +) -> MergeResult: + """Merges scraped data from a Texas docket into the `Docket` table. + + :param docket_data: The scraped Texas docket data. + :param download_attachments: Whether to download docket entry attachments. + + :return: The result of the merge operation.""" + court = Court.objects.get( + pk=texas_js_court_id_to_court_id(docket_data["court_id"]) + ) + docket_number = docket_data["docket_number"] + logger.info("Merging Texas docket %s", docket_number) + + if docket_data["court_type"] == CourtType.UNKNOWN.value: + logger.error("Texas docket %s has unknown court type", docket_number) + return MergeResult.failed() + + with transaction.atomic(): + docket = None + if docket_data["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Docket is appellate. Checking if disaggregation is necessary..." 
+ ) + docket = async_to_sync(find_docket_object)( + court_id="texapp", + pacer_case_id=None, + docket_number=docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=False, + ) + if docket is not None: + logger.info( + "Disaggregating Texas appellate docket %s", docket_number + ) + docket.court = court + if docket is None: + docket = async_to_sync(find_docket_object)( + court_id=court.pk, + pacer_case_id=None, + docket_number=docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=True, + ) + docket.add_scraper_source() + docket.docket_number = docket_number + docket.docket_number_core = make_texas_docket_number_core( + docket_number + ) + docket.date_filed = docket_data["date_filed"] + docket.cause = docket_data["case_type"] + docket.case_name = docket_data["case_name"] + docket.case_name_full = docket_data["case_name_full"] + docket.docket_number_raw = docket_number + originating_court_merge_result = merge_texas_docket_originating_court( + docket, docket_data + ) + + if texas_docket_has_appellate_info(docket_data): + lower_court_data = docket_data["appeals_court"] + lower_court_id = texas_js_court_id_to_court_id( + lower_court_data["court_id"] + ) + lower_court_name = lower_court_data["district"] + else: + lower_court_data = docket_data["originating_court"] + lower_court_id = texas_originating_court_to_court_id( + lower_court_data + ) + lower_court_name = lower_court_data.get("name", "") + + # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts + if lower_court_id: + try: + lower_court = Court.objects.get(pk=lower_court_id) + except Court.DoesNotExist: + logger.error( + "Could not find lower court with ID %s to set appeal_from for Texas docket.", + lower_court_id, + ) + 
else: + docket.appeal_from = lower_court + lower_court_name = lower_court.full_name + docket.appeal_from_str = lower_court_name + + docket.save() + + if docket_data["court_type"] == CourtType.SUPREME.value: + trial_court_result = merge_texas_trial_court_data(docket, docket_data) + else: + trial_court_result = MergeResult.unnecessary(None) + + party_merge_result = merge_texas_parties(docket, docket_data["parties"]) + + entry_merge_result = merge_texas_docket_entries( + docket, + docket_data["case_events"], + docket_data["appellate_briefs"], + download_attachments=download_attachments, + ) + + merge_case_transfer_result = merge_texas_case_transfers( + docket, docket_data + ) + + create = ( + party_merge_result.create + or trial_court_result.create + or originating_court_merge_result.create + or merge_case_transfer_result.create + or entry_merge_result.create + ) + update = ( + party_merge_result.update + or trial_court_result.update + or originating_court_merge_result.update + or merge_case_transfer_result.update + or entry_merge_result.update + ) + success = ( + party_merge_result.success + and trial_court_result.success + and originating_court_merge_result.success + and merge_case_transfer_result.success + and entry_merge_result.success + ) + if not success: + logger.error( + "One or more steps in Texas case merging failed for docket %s (pk %s) in court %s. Please review logs.", + docket_number, + docket.pk, + court.pk, + ) + + return MergeResult( + create=create, + update=update, + success=success, + pk=docket.pk, + ) + + +@app.task( + bind=True, + max_retries=5, + ignore_result=True, +) +@time_call(logger) +def texas_ingest_docket_task( + task: Task, + i: tuple[bytes, TexasDocketMeta], + download_attachments: bool = True, +) -> MergeResult: + """ + Task to parse and merge a Texas docket. + + :param task: The Celery task. + + :param i: Tuple with the following entries: + - Bytes string to parse. + - Docket metadata. 
+ :param download_attachments: Whether to download docket entry attachments. + + :return: The result of the merge operation. + """ + content, meta = i + logger.info("Attempting to parse Texas docket %s...", meta.case_number) + try: + match meta.court_code: + case "cossup": + parser = TexasSupremeCourtScraper() + case "coscca": + parser = TexasCourtOfCriminalAppealsScraper() + case court_code if court_code.startswith("coa"): + parser = TexasCourtOfAppealsScraper(meta.court_code) + case _: + logger.error( + "Unrecognized Texas court type %s. Cannot parse docket %s.", + meta.court_code, + meta.case_number, + ) + task.request.chain = None + return MergeResult.failed() + + parser._parse_text(content.decode("utf-8")) + docket_data = parser.data + except Exception as e: + logger.error( + "Encountered error parsing Texas docket at URL %s: %s", + meta.case_url, + str(e), + ) + task.request.chain = None + return MergeResult.failed() + return merge_texas_docket( + docket_data, download_attachments=download_attachments + ) + + +@app.task( + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +def default_corpus_download_task( + bucket: str, s3_key: str +) -> tuple[bytes, str, str]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param bucket: S3 bucket name. + :param s3_key: S3 key to download file from. 
+ :return: Tuple with entries: Bytes of downloaded file, the bucket + parameter, and the s3_key parameter.""" + logger.info("Downloading file from S3: %s", s3_key) + storage = AWSMediaStorage(bucket_name=bucket) + with storage.open(s3_key, "rb") as f: + content = f.read() + return content, bucket, s3_key + + +@app.task( + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +@time_call(logger) +def texas_corpus_download_task( + docket: tuple[str, str], + docket_meta: tuple[str, str], +) -> tuple[bytes, TexasDocketMeta]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param docket: Tuple of S3 bucket name and key where docket HTML is stored. + + :param docket_meta: Tuple of S3 bucket name and key where docket metadata\ + is stored. + + :return: Tuple with entries: Bytes of downloaded file, dictionary with\ + response headers, and docket metadata.""" + storage = AWSMediaStorage(bucket_name=docket[0]) + logger.info( + "Downloading docket HTML from S3: (Bucket: %s; Path: %s)", + docket[0], + docket[1], + ) + with storage.open(docket[1], "rb") as f: + content = f.read() + + storage = AWSMediaStorage(bucket_name=docket_meta[0]) + logger.info( + "Downloading docket meta from S3: (Bucket: %s; Path: %s)", + docket_meta[0], + docket_meta[1], + ) + with storage.open(docket_meta[1], "r") as f: + meta = TexasDocketMeta.model_validate_json(f.read()) + + return content, meta diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 4f849633cd..cd8552fbcf 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -25,6 +25,7 @@ from juriscraper.state.texas import ( TexasCaseParty, ) +from juriscraper.state.texas.common import CourtID, CourtType from openai import RateLimitError from pydantic import ValidationError @@ -81,14 +82,18 @@ update_latest_case_id_and_schedule_iquery_sweep, ) from cl.corpus_importer.tasks 
import ( + MergeResult, classify_case_name_by_llm, download_texas_document_pdf, generate_ia_json, get_and_save_free_document_report, + merge_texas_case_transfers, + merge_texas_docket, merge_texas_docket_entry, + merge_texas_docket_originating_court, merge_texas_document, - merge_texas_documents, merge_texas_parties, + merge_texas_trial_court_data, normalize_texas_parties, probe_or_scrape_iquery_pages, ) @@ -108,6 +113,7 @@ winnow_case_name, ) from cl.favorites.models import PrayerAvailability +from cl.lib.model_helpers import make_texas_docket_number_core from cl.lib.pacer import process_docket_data from cl.lib.redis_utils import get_redis_interface from cl.people_db.factories import ( @@ -148,6 +154,7 @@ from cl.scrapers.tasks import update_docket_info_iquery from cl.search.cluster_sources import ClusterSources from cl.search.factories import ( + CaseTransferFactory, CourtFactory, DocketEntryFactory, DocketFactory, @@ -161,22 +168,36 @@ ) from cl.search.models import ( SEARCH_TYPES, + CaseTransfer, Citation, Docket, Opinion, OpinionCluster, + OriginatingCourtInformation, RECAPDocument, + TrialCourtData, ) from cl.search.state.texas.factories import ( + TexasAppellateBriefDictFactory, + TexasAppellateCourtInfoDictFactory, + TexasAppellateTransferDictFactory, TexasCaseDocumentDictFactory, - TexasDocketEntryDictFactory, + TexasCaseEventDictFactory, + TexasCasePartyDictFactory, + TexasCourtOfAppealsDocketDictFactory, TexasDocketEntryFactory, TexasDocumentFactory, + TexasFinalCourtDocketDictFactory, + TexasOriginatingCourtDictFactory, + TexasOriginatingDistrictCourtDictFactory, + TexasSupremeCourtAppellateBriefDictFactory, + TexasSupremeCourtCaseEventDictFactory, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument from cl.settings import MEDIA_ROOT from cl.tests.cases import TestCase from cl.tests.fakes import FakeCaseQueryReport, FakeFreeOpinionReport +from cl.tests.providers import fake from cl.tests.utils import MockResponse from 
cl.users.factories import UserProfileWithParentsFactory @@ -2165,12 +2186,17 @@ def setUp(self): @classmethod def setUpTestData(cls): """Create test data for Texas merger tests""" - cls.texas_sc = CourtFactory.create(id="texas_sc") - cls.texas_cca = CourtFactory.create(id="texas_cca") - cls.texas_coa1 = CourtFactory.create(id="texas_coa1") + cls.texas_sc = CourtFactory.create(id="tex") + cls.texas_cca = CourtFactory.create(id="texcrimapp") + cls.texas_coa1 = CourtFactory.create(id="txctapp1") + cls.texas_dc100 = CourtFactory.create(id="texdistct101") cls.docket_number_coa1 = "01-25-00011-CV" cls.docket_coa1 = DocketFactory.create( - court=cls.texas_coa1, docket_number=cls.docket_number_coa1 + court=cls.texas_coa1, + docket_number=cls.docket_number_coa1, + docket_number_core=make_texas_docket_number_core( + cls.docket_number_coa1 + ), ) cls.docket_coa1_entry = TexasDocketEntryFactory.create( docket=cls.docket_coa1, @@ -2182,19 +2208,153 @@ def tearDown(self): self.extract_pdf_document_patch.stop() self.download_pdf_patch.stop() + def get_random_docket_entry_dict(self, **kwargs): + return fake.random_element( + ( + TexasSupremeCourtAppellateBriefDictFactory(**kwargs), + TexasSupremeCourtCaseEventDictFactory(**kwargs), + TexasAppellateBriefDictFactory(**kwargs), + TexasCaseEventDictFactory(**kwargs), + ) + ) + + def test_normalize_texas_parties_empty_atty_name(self): + party_0 = TexasCasePartyDictFactory(representatives=[""]) + # Filter out empty representatives + parties = [party_0] + + normalized = normalize_texas_parties(parties) + + self.assertEqual( + normalized, + [ + { + "name": party_0["name"], + "type": party_0["type"], + "date_terminated": None, + "extra_info": "", + "attorneys": [], + } + ], + ) + + @patch( + "cl.corpus_importer.tasks.merge_texas_parties", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_case_transfers", + return_value=MergeResult.created(1), + ) + @patch( + 
"cl.corpus_importer.tasks.merge_texas_trial_court_data", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_docket_originating_court", + return_value=MergeResult.created(1), + ) + def test_merge_docket_entries_integration( + self, + mock_texas_oci, + mock_texas_tcd, + mock_texas_transfers, + mock_texas_parties, + ): + n_events = fake.random_int(min=0, max=30) + case_events = sorted( + [TexasSupremeCourtCaseEventDictFactory() for _ in range(n_events)], + key=lambda ce: ce["date"], + ) + + if len(case_events) == 0: + appellate_brief_indices = [] + else: + appellate_brief_indices = sorted( + fake.random_elements(range(len(case_events)), unique=True) + ) + + appellate_briefs = [ + TexasSupremeCourtAppellateBriefDictFactory( + date=case_events[i]["date"], + type=case_events[i]["type"], + attachments=case_events[i]["attachments"], + remarks=case_events[i]["remarks"], + ) + for i in appellate_brief_indices + ] + + actual_flags = [ + True if i in appellate_brief_indices else False + for i in range(len(case_events)) + ] + + docket_dict = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + case_events=case_events, + appellate_briefs=appellate_briefs, + ) + original_docket_entries = [ + e["pk"] for e in TexasDocketEntry.objects.all().values("pk") + ] + merge_result = merge_texas_docket(docket_dict) + + docket_entries = list( + TexasDocketEntry.objects.exclude( + pk__in=original_docket_entries + ).order_by("sequence_number") + ) + self.assertEqual( + len(docket_entries), + len(case_events), + f"Generated {len(docket_entries)} docket entries from {len(case_events)} input case events.", + ) + + ab_index = 0 + for i, docket_entry in enumerate(docket_entries): + self.assertEqual( + docket_entry.appellate_brief, + actual_flags[i], + f"Docket entry {i} has the wrong appellate brief flag (found {docket_entry.appellate_brief}, expected {actual_flags[i]}).", + ) + self.assertEqual( + docket_entry.remarks, + 
case_events[i]["remarks"], + f"Docket entry {i} has the wrong remarks (found {docket_entry.remarks}, expected {case_events[i]['remarks']}).", + ) + self.assertEqual( + docket_entry.disposition, + case_events[i]["disposition"], + f"Docket entry {i} has the wrong disposition (found {docket_entry.disposition}, expected {case_events[i]['disposition']}).", + ) + if actual_flags[i]: + self.assertEqual( + docket_entry.description, + appellate_briefs[ab_index]["description"], + f"Docket entry {i} has the wrong description (found {docket_entry.description}, expected {appellate_briefs[ab_index]['description']}).", + ) + ab_index += 1 + else: + self.assertEqual( + docket_entry.description, + "", + f"Docket entry {i} should not have description (found {docket_entry.description}).", + ) + def test_merge_texas_document_new_document(self): """Can we correctly add a new attachment to an existing docket entry?""" docket_entry = self.docket_coa1_entry input_document = TexasCaseDocumentDictFactory() - # Run the function - result = merge_texas_document(docket_entry, input_document) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_document(docket_entry, input_document) - # Assertions - assert result == (True, True, result[2]) + assert result.create is True + assert result.success is True + assert result.pk is not None try: - created_document = TexasDocument.objects.get(pk=result[2]) + created_document = TexasDocument.objects.get(pk=result.pk) except ObjectDoesNotExist: created_document = None assert created_document is not None @@ -2225,12 +2385,12 @@ def test_merge_texas_document_existing_document_no_update(self): current_document.filepath_local = "a" current_document.save() - # Run the function result = merge_texas_document(docket_entry, input_document) - # Assertions - assert result == (False, True, current_document.pk) - result_document = TexasDocument.objects.get(pk=result[2]) + assert result.create is False + assert result.success is True + assert 
result.pk == current_document.pk + result_document = TexasDocument.objects.get(pk=result.pk) assert result_document is not None assert result_document.docket_entry_id == docket_entry.id assert result_document.description == input_document["description"] @@ -2251,7 +2411,6 @@ def test_merge_texas_document_existing_document_update(self): media_id=input_document["media_id"], ) - # Create an attachment current_document = TexasDocumentFactory.create( docket_entry=docket_entry, description=old_document["description"], @@ -2260,12 +2419,14 @@ def test_merge_texas_document_existing_document_update(self): url=old_document["document_url"], ) - # Run the function - result = merge_texas_document(docket_entry, input_document) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_document(docket_entry, input_document) - # Assertions - assert result == (True, True, current_document.pk) - result_document = TexasDocument.objects.get(pk=result[2]) + assert result.create is False + assert result.update is True + assert result.success is True + assert result.pk == current_document.pk + result_document = TexasDocument.objects.get(pk=result.pk) assert result_document is not None assert result_document.docket_entry_id == docket_entry.id assert result_document.description == input_document["description"] @@ -2278,11 +2439,30 @@ def test_merge_texas_document_existing_document_update(self): self.download_task_mock.assert_called_once_with(current_document.pk) + def test_merge_texas_document_skips_download_when_disabled(self): + """Are attachment downloads skipped when download_attachments=False?""" + docket_entry = self.docket_coa1_entry + input_document = TexasCaseDocumentDictFactory() + + result = merge_texas_document( + docket_entry, input_document, download_attachments=False + ) + + assert result.create is True + assert result.success is True + assert result.pk is not None + self.download_task_mock.assert_not_called() + + 
@mock.patch("cl.lib.celery_utils.get_task_wait", return_value=0) + @mock.patch("cl.corpus_importer.tasks.doc_page_count_service") @responses.activate - def test_merge_texas_document_plaintext_extraction(self): + def test_merge_texas_document_plaintext_extraction( + self, pcs_mock, throttle_mock + ): """ Ensure plaintext extraction is triggered by `merge_texas_document`. """ + pcs_mock.return_value = httpx.Response(200, text="1") # Stop the mocks just for this test self.download_task_patch.stop() self.extract_pdf_document_patch.stop() @@ -2305,57 +2485,49 @@ def get_test_pdf( ) docket_entry = self.docket_coa1_entry - (_, _, pk) = merge_texas_document(docket_entry, input_document) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_document(docket_entry, input_document) docket_entry.refresh_from_db() - document = TexasDocument.objects.get(pk=pk) + document = TexasDocument.objects.get(pk=result.pk) self.assertEqual(response.call_count, 1) self.assertEqual(document.url, input_document["document_url"]) self.assertTrue(document.filepath_local) self.assertIn("UNITED", document.plain_text) - def test_merge_texas_documents(self): - """Can we correctly handle multiple documents?""" - docket_entry = self.docket_coa1_entry - existing_document = TexasCaseDocumentDictFactory() - current_attachment = TexasDocumentFactory.create( - docket_entry=docket_entry, - description=existing_document["description"], - media_id=existing_document["media_id"], - media_version_id=existing_document["media_version_id"], - url=existing_document["document_url"], - ) - current_attachment.filepath_local = "a" - current_attachment.save() - input_documents = [ - TexasCaseDocumentDictFactory(), - existing_document, - ] - - result = merge_texas_documents(docket_entry, input_documents) - - assert len(result) == 2 - assert result[0] == (True, True, result[0][2]) - assert result[1] == (False, True, current_attachment.pk) - def test_merge_texas_docket_entry_new_entry(self): """Can we 
correctly handle a docket entry?""" - docket_entry = TexasDocketEntryDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[TexasCaseDocumentDictFactory()], date=date.fromisoformat("2025-01-02"), type="Brief", ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) - output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, docket_entry - ) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief + ) - assert output == (True, True, output[2]) - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is True + assert output.update is False + assert output.success is True + assert output.pk is not None + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == docket_entry["type"] - assert created_docket_entry.description == docket_entry["description"] - assert created_docket_entry.date_filed == docket_entry["date"] + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" + ) + assert created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() @@ -2364,11 +2536,24 @@ def test_merge_texas_docket_entry_new_entry(self): def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" - js_docket_entry = TexasDocketEntryDictFactory() - - (_, _, pk) = merge_texas_docket_entry( 
- self.docket_coa1, "2025-01-02.000", True, js_docket_entry + case_event = TexasCaseEventDictFactory( + attachments=[TexasCaseDocumentDictFactory()], + date=date.fromisoformat("2025-01-02"), + type="Brief", ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) + + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_docket_entry( + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief + ) + pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: document.filepath_local = "a" @@ -2377,33 +2562,52 @@ def test_merge_texas_docket_entry_no_update(self): self.extract_pdf_document_mock.reset_mock() # noop - output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry - ) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief + ) - assert output == (False, True, output[2]) - assert output[2] == pk - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk is not None + assert output.pk == pk + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == js_docket_entry["type"] - assert ( - created_docket_entry.description == js_docket_entry["description"] + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" ) - assert created_docket_entry.date_filed == js_docket_entry["date"] + assert 
created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() - assert n_attachments == len(js_docket_entry["attachments"]) + assert n_attachments == len(case_event["attachments"]) assert self.extract_pdf_document_mock.call_count == 0 def test_merge_texas_docket_entry_add_document(self): """Can we correctly add a new document to an existing docket entry?""" - js_docket_entry = TexasDocketEntryDictFactory() - initial_n_attachments = len(js_docket_entry["attachments"]) - - (_, _, pk) = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + case_event = TexasCaseEventDictFactory( + attachments=[TexasCaseDocumentDictFactory()], + date=date.fromisoformat("2025-01-02"), + type="Brief", ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) + initial_n_attachments = len(case_event["attachments"]) + + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_docket_entry( + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief + ) + pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: document.filepath_local = "a" @@ -2411,20 +2615,26 @@ def test_merge_texas_docket_entry_add_document(self): # Reset call count self.extract_pdf_document_mock.reset_mock() - js_docket_entry["attachments"].append(TexasCaseDocumentDictFactory()) - output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry - ) + case_event["attachments"].append(TexasCaseDocumentDictFactory()) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief + ) - assert output == (True, 
True, output[2]) - assert output[2] == pk - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is True + assert output.update is True + assert output.success is True + assert output.pk is not None + assert output.pk == pk + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == js_docket_entry["type"] - assert ( - created_docket_entry.description == js_docket_entry["description"] + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" ) - assert created_docket_entry.date_filed == js_docket_entry["date"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() @@ -2440,7 +2650,7 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-02.000", - description="First entry", + disposition="First entry", ) existing_entry_2 = TexasDocketEntryFactory.create( docket=self.docket_coa1, @@ -2448,28 +2658,36 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-02.001", - description="Second entry", + disposition="Second entry", ) - js_docket_entry = TexasDocketEntryDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], - description="Updated description", + disposition="Updated disposition", date=date.fromisoformat("2025-01-02"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + 
attachments=case_event["attachments"], + ) # Should match the second entry by sequence number output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.001", True, js_docket_entry + self.docket_coa1, "2025-01-02.001", case_event, appellate_brief ) - assert output == (False, True, existing_entry_2.pk) - updated_entry = TexasDocketEntry.objects.get(pk=output[2]) - assert updated_entry.description == "Updated description" + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk == existing_entry_2.pk + updated_entry = TexasDocketEntry.objects.get(pk=output.pk) + assert updated_entry.disposition == case_event["disposition"] assert updated_entry.sequence_number == "2025-01-02.001" # Ensure the first entry was not modified existing_entry_1.refresh_from_db() - assert existing_entry_1.description == "First entry" + assert existing_entry_1.disposition == "First entry" def test_merge_texas_docket_entry_single_match_updates_entry(self): """When exactly one entry matches by date/type/brief, update it even @@ -2480,24 +2698,32 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-04.000", - description="Original description", + disposition="Original description", ) - js_docket_entry = TexasDocketEntryDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], - description="Updated description", + disposition="Updated disposition", date=date.fromisoformat("2025-01-04"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) # Should update existing entry and change its sequence number output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-04.001", True, js_docket_entry + self.docket_coa1, "2025-01-04.001", case_event, appellate_brief ) - assert output == (False, True, existing_entry.pk) - 
updated_entry = TexasDocketEntry.objects.get(pk=output[2]) - assert updated_entry.description == "Updated description" + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk == existing_entry.pk + updated_entry = TexasDocketEntry.objects.get(pk=output.pk) + assert updated_entry.disposition == case_event["disposition"] assert updated_entry.sequence_number == "2025-01-04.001" def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): @@ -2509,7 +2735,7 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-03.000", - description="First entry", + disposition="First entry", ) existing_entry_2 = TexasDocketEntryFactory.create( docket=self.docket_coa1, @@ -2517,32 +2743,38 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-03.001", - description="Second entry", + disposition="Second entry", ) - - js_docket_entry = TexasDocketEntryDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], - description="New third entry", + disposition="New third entry", date=date.fromisoformat("2025-01-03"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) # Should create a new entry since no sequence number matches output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-03.002", True, js_docket_entry + self.docket_coa1, "2025-01-03.002", case_event, appellate_brief ) - assert output[0] is True # created - assert output[1] is True # success - assert output[2] not in (existing_entry_1.pk, existing_entry_2.pk) - new_entry = TexasDocketEntry.objects.get(pk=output[2]) - assert new_entry.description == "New third entry" + assert output.create is True + assert output.update is False + assert 
output.success is True + assert output.pk is not None + assert output.pk not in (existing_entry_1.pk, existing_entry_2.pk) + new_entry = TexasDocketEntry.objects.get(pk=output.pk) + assert new_entry.disposition == case_event["disposition"] assert new_entry.sequence_number == "2025-01-03.002" # Ensure existing entries were not modified existing_entry_1.refresh_from_db() existing_entry_2.refresh_from_db() - assert existing_entry_1.description == "First entry" - assert existing_entry_2.description == "Second entry" + assert existing_entry_1.disposition == "First entry" + assert existing_entry_2.disposition == "Second entry" def test_merge_single_party_with_attorney(self): """Can we merge a single party with an attorney?""" @@ -2728,9 +2960,12 @@ def test_normalize_empty_parties_list(self): result = normalize_texas_parties([]) assert result == [] + @mock.patch("cl.lib.celery_utils.get_task_wait", return_value=0) @mock.patch("cl.corpus_importer.tasks.doc_page_count_service") @responses.activate - def test_download_texas_document_pdf_success(self, pcs_mock): + def test_download_texas_document_pdf_success( + self, pcs_mock, throttle_mock + ): """Can we successfully download a PDF for a TexasDocument?""" self.download_pdf_patch.stop() texas_document = TexasDocumentFactory.create() @@ -2785,6 +3020,484 @@ def test_download_texas_document_pdf_download_failure(self): texas_document.refresh_from_db() assert not texas_document.filepath_local + def test_merge_texas_docket_originating_court_creates_new(self): + """Can we create new originating court information?""" + self.docket_coa1.originating_court_information = None + self.docket_coa1.save() + docket_data = TexasCourtOfAppealsDocketDictFactory( + docket_number=self.docket_number_coa1, + originating_court=TexasOriginatingDistrictCourtDictFactory( + district=5, + ), + ) + + result = merge_texas_docket_originating_court( + self.docket_coa1, docket_data + ) + + assert result.create is True + assert result.success is True + + 
self.docket_coa1.refresh_from_db() + originating_info = self.docket_coa1.originating_court_information + assert originating_info is not None + assert ( + originating_info.docket_number + == docket_data["originating_court"]["case"] + ) + assert ( + originating_info.court_reporter + == docket_data["originating_court"]["reporter"] + ) + assert ( + originating_info.assigned_to_str + == docket_data["originating_court"]["judge"] + ) + + def test_merge_texas_docket_originating_court_updates_existing(self): + """Can we update existing originating court information?""" + # Create existing originating court information + self.docket_coa1.originating_court_information = ( + OriginatingCourtInformation.objects.create( + docket_number="OLD-123", + court_reporter="Old Reporter", + assigned_to_str="Old Judge", + ) + ) + self.docket_coa1.save() + + originating_court = TexasOriginatingDistrictCourtDictFactory() + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + docket_number=self.docket_number_coa1, + originating_court=originating_court, + ) + + result = merge_texas_docket_originating_court( + self.docket_coa1, docket_data + ) + + assert result.create is False + assert result.success is True + + self.docket_coa1.refresh_from_db() + updated_info = self.docket_coa1.originating_court_information + assert updated_info is not None + assert updated_info.docket_number == originating_court["case"] + assert updated_info.court_reporter == originating_court["reporter"] + assert updated_info.assigned_to_str == originating_court["judge"] + + def test_merge_texas_case_transfers_appellate_court_from_trial(self): + """Can we create a CaseTransfer for an appellate court case?""" + texas_district = CourtFactory.create(id="texdistct6") + + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + 
docket_number=self.docket_number_coa1, + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.all() + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.destination_court == self.texas_coa1 + assert transfer.destination_docket_number == self.docket_number_coa1 + assert transfer.origin_court == texas_district + assert transfer.origin_docket_number == originating_court["case"] + assert transfer.transfer_type == CaseTransfer.APPEAL + assert transfer.transfer_date == docket_data["date_filed"] + + def test_merge_texas_case_transfers_appellate_with_workload_transfer( + self, + ): + """Can we create CaseTransfer for appellate case with work sharing?""" + texas_district = CourtFactory.create(id="texdistct6") + texas_coa2 = CourtFactory.create(id="txctapp2") + + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + transfer_from = TexasAppellateTransferDictFactory( + court_id=CourtID.SECOND_COURT_OF_APPEALS.value, + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + docket_number=self.docket_number_coa1, + originating_court=originating_court, + transfer_from=transfer_from, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.all() + assert transfers.count() == 2 + + appeal_transfer = transfers.get(transfer_type=CaseTransfer.APPEAL) + assert appeal_transfer.destination_court == self.texas_coa1 + assert ( + appeal_transfer.destination_docket_number + == self.docket_number_coa1 + ) + assert appeal_transfer.origin_court == texas_district + assert ( + appeal_transfer.origin_docket_number == originating_court["case"] + ) + + workload_transfer = 
transfers.get(transfer_type=CaseTransfer.WORKLOAD) + assert workload_transfer.destination_court == self.texas_coa1 + assert ( + workload_transfer.destination_docket_number + == self.docket_number_coa1 + ) + assert workload_transfer.origin_court == texas_coa2 + assert ( + workload_transfer.origin_docket_number + == transfer_from["origin_docket"] + ) + + def test_merge_texas_case_transfers_supreme_court(self): + """Can we create a CaseTransfer for a Supreme Court case?""" + docket_sc = DocketFactory.create(court=self.texas_sc) + + appeals_court = TexasAppellateCourtInfoDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + case_number=self.docket_number_coa1, + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + docket_number=docket_sc.docket_number, + appeals_court=appeals_court, + is_direct_appeal=False, + ) + + result = merge_texas_case_transfers(docket_sc, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.all() + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.destination_court == self.texas_sc + assert transfer.destination_docket_number == docket_sc.docket_number + assert transfer.origin_court == self.texas_coa1 + assert transfer.origin_docket_number == self.docket_number_coa1 + assert transfer.transfer_type == CaseTransfer.APPEAL + assert transfer.transfer_date == docket_data["date_filed"] + + def test_merge_texas_case_transfers_cca_from_appellate(self): + """Can we create a CaseTransfer for CCA from appellate court?""" + docket_cca = DocketFactory.create(court=self.texas_cca) + + appeals_court = TexasAppellateCourtInfoDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + case_number=self.docket_number_coa1, + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, + docket_number=docket_cca.docket_number, + appeals_court=appeals_court, + 
def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal(
    self,
):
    """Is a direct appeal to the CCA recorded as coming from the trial court?

    With ``is_direct_appeal=True`` the single CaseTransfer created must
    originate at the district court rather than at a court of appeals.
    """
    district_court = CourtFactory.create(id="texdistct6")
    cca_docket = DocketFactory.create(court=self.texas_cca)

    trial_court = TexasOriginatingDistrictCourtDictFactory(district=5)
    scraped = TexasFinalCourtDocketDictFactory(
        court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value,
        docket_number=cca_docket.docket_number,
        originating_court=trial_court,
        is_direct_appeal=True,
    )

    outcome = merge_texas_case_transfers(cca_docket, scraped)

    assert outcome.success is True
    assert outcome.create is True

    all_transfers = CaseTransfer.objects.all()
    assert all_transfers.count() == 1

    created = all_transfers.first()
    # Destination side: the CCA docket being merged.
    assert created.destination_court == self.texas_cca
    assert created.destination_docket_number == cca_docket.docket_number
    # Origin side: the trial court named in the scraped data.
    assert created.origin_court == district_court
    assert created.origin_docket_number == trial_court["case"]
    assert created.transfer_type == CaseTransfer.APPEAL
def test_merge_texas_case_transfers_duplicate_handling(self):
    """Does the merge avoid duplicating a CaseTransfer that already exists?"""
    district_court = CourtFactory.create(id="texdistct6")

    trial_court = TexasOriginatingDistrictCourtDictFactory(district=5)
    scraped = TexasCourtOfAppealsDocketDictFactory(
        court_id=CourtID.FIRST_COURT_OF_APPEALS.value,
        docket_number=self.docket_number_coa1,
        originating_court=trial_court,
        transfer_from=None,
    )

    # Pre-create a transfer built from the same courts, docket numbers,
    # date and type as the scraped data describes.
    existing_fields = {
        "origin_court": district_court,
        "origin_docket": None,
        "origin_docket_number": trial_court["case"],
        "destination_court": self.texas_coa1,
        "destination_docket_number": self.docket_number_coa1,
        "destination_docket": self.docket_coa1,
        "transfer_date": scraped["date_filed"],
        "transfer_type": CaseTransfer.APPEAL,
    }
    CaseTransferFactory.create(**existing_fields)

    outcome = merge_texas_case_transfers(self.docket_coa1, scraped)

    assert outcome.success is True
    assert outcome.create is False

    # Still exactly one row: nothing new was inserted.
    remaining = CaseTransfer.objects.all()
    assert remaining.count() == 1
def test_merge_texas_docket_appeal_from_missing_court(self):
    """Is appeal_from left unset when the appeals court cannot be resolved?

    The scraped appeals court uses an ID ("texas_coa17") that is not
    expected to match any Court; the merge should still succeed, keep
    the scraped district name in ``appeal_from_str`` and leave the
    ``appeal_from`` relation empty.
    """
    unknown_appeals_court = TexasAppellateCourtInfoDictFactory(
        court_id="texas_coa17", district="Not Real Court of Appeals"
    )
    scraped = TexasFinalCourtDocketDictFactory.create(
        is_direct_appeal=False,
        appeals_court=unknown_appeals_court,
    )

    outcome = merge_texas_docket(scraped)

    assert outcome.success is True

    merged = Docket.objects.get(pk=outcome.pk)

    expected_name = scraped["appeals_court"]["district"]
    assert merged.appeal_from_str == expected_name
    assert merged.appeal_from is None
return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_parties", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_docket_originating_court", + return_value=MergeResult.created(1), + ) + def test_merge_texas_docket_populates_all_fields( + self, mock_oci, mock_parties, mock_entry, mock_transfers + ): + """Does merge_texas_docket populate all Docket fields from input data?""" + texas_district = CourtFactory.create(id="texdistct6") + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_docket(docket_data) + + assert result.success is True + assert result.pk is not None + + docket = Docket.objects.get(pk=result.pk) + assert docket.source & Docket.SCRAPER + assert docket.court_id == "txctapp1" + assert docket.docket_number == docket_data["docket_number"] + assert docket.docket_number_core == make_texas_docket_number_core( + docket_data["docket_number"] + ) + assert docket.docket_number_raw == docket_data["docket_number"] + assert docket.case_name == docket_data["case_name"] + assert docket.case_name_full == docket_data["case_name_full"] + assert docket.date_filed == docket_data["date_filed"] + assert docket.cause == docket_data["case_type"] + assert docket.appeal_from_id == "texdistct6" + assert docket.appeal_from_str == texas_district.full_name + + def test_merge_trial_court_data(self): + """Can we create and then update TrialCourtData?""" + # Test written with the help of Claude Code + texas_district = CourtFactory.create(id="texdistct6") + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + originating_court=originating_court, + ) + + docket_sc = 
DocketFactory.create(court=self.texas_sc) + result = merge_texas_trial_court_data(docket_sc, docket_data) + + assert result.create is True + assert result.success is True + assert result.pk is not None + + tcd = TrialCourtData.objects.get(pk=result.pk) + assert tcd.docket_id == docket_sc.pk + assert tcd.docket_number_raw_trial == originating_court["case"] + assert tcd.docket_number_trial == originating_court["case"] + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == originating_court["punishment"] + assert tcd.county == originating_court["county"] + assert tcd.court == texas_district + assert tcd.court_name == texas_district.full_name + + # Merging the same data again should be unnecessary + result2 = merge_texas_trial_court_data(docket_sc, docket_data) + tcd.refresh_from_db() + assert result2.create is False + assert result2.update is False + assert result2.success is True + assert tcd.docket_number_raw_trial == originating_court["case"] + assert tcd.docket_number_trial == originating_court["case"] + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == originating_court["punishment"] + assert tcd.county == originating_court["county"] + assert result2.pk == tcd.pk + assert TrialCourtData.objects.filter(docket=docket_sc).count() == 1 + + # Merging changed data should update + new_dn = originating_court["case"] + "Different" + originating_court["case"] = new_dn + + result3 = merge_texas_trial_court_data(docket_sc, docket_data) + tcd.refresh_from_db() + assert result3.create is False + assert result3.update is True + assert result3.success is True + assert tcd.docket_number_raw_trial == originating_court["case"] + assert tcd.docket_number_trial == new_dn + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == 
def texas_js_court_id_to_court_id(js_court_id: str) -> str | None:
    """Translates a Juriscraper Texas court ID to a CourtListener Court ID.

    The Supreme Court and the Court of Criminal Appeals map to fixed IDs.
    Every other ID is treated as a court of appeals of the form
    ``texas_coa<number>`` and mapped to ``txctapp<number>`` (leading zeros
    in the number are dropped).

    :param js_court_id: The court ID extracted from Juriscraper.
    :return: The corresponding Court ID or None if invalid."""
    if js_court_id == CourtID.SUPREME_COURT.value:
        return "tex"
    if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value:
        return "texcrimapp"
    if js_court_id == CourtID.UNKNOWN.value:
        logger.error("Unknown court ID: %s", js_court_id)
        return None
    # Court of appeals
    suffix = js_court_id.removeprefix("texas_coa")
    try:
        appellate_number = int(suffix)
    except ValueError:
        # Honour the documented contract: an ID that is neither a known
        # final court nor "texas_coa<number>" is invalid, so report it and
        # return None instead of letting the ValueError escape.
        logger.error("Invalid Texas court of appeals ID: %s", js_court_id)
        return None
    return f"txctapp{appellate_number}"
User.objects.get(username="recap") - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( recap_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) receipt_report = DownloadConfirmationPage(court_id, s) - receipt_report.query(pacer_doc_id) + async_to_sync(receipt_report.query)(pacer_doc_id) data = receipt_report.data if data == {} and not is_pdf(receipt_report.response): diff --git a/cl/lasc/tasks.py b/cl/lasc/tasks.py index 22690880fd..b9defb6a34 100644 --- a/cl/lasc/tasks.py +++ b/cl/lasc/tasks.py @@ -3,13 +3,14 @@ import os import pickle +from asgiref.sync import async_to_sync from django.apps import apps from django.conf import settings from django.core.files.base import ContentFile from django.db import transaction +from httpx import RequestError from juriscraper.lasc.fetch import LASCSearch from juriscraper.lasc.http import LASCSession -from requests import RequestException from cl.celery_init import app from cl.lasc.models import ( @@ -110,8 +111,8 @@ def download_pdf(self, pdf_pk): return try: - pdf_data = lasc.get_pdf_from_url(q_pdf.document_url) - except RequestException as exc: + pdf_data = async_to_sync(lasc.get_pdf_from_url)(q_pdf.document_url) + except RequestError as exc: logger.warning( "Got RequestException trying to get PDF for PDF Queue %s", q_pdf.pk, @@ -194,9 +195,11 @@ def add_or_update_case_db(self, case_id): clean_data = {} try: - clean_data = lasc.get_json_from_internal_case_id(case_id) + clean_data = async_to_sync(lasc.get_json_from_internal_case_id)( + case_id + ) logger.info("Successful Query") - except RequestException as e: + except RequestError as e: retries_remaining = self.max_retries - self.request.retries if retries_remaining == 0: logger.error("RequestException, unable to get case at %s", case_id) @@ -349,8 +352,8 @@ def fetch_date_range(self, start, end): lasc = make_lasc_search() try: 
def time_call(fn_logger: logging.Logger) -> Callable:
    """Build a decorator that logs how long each call to a function takes.

    The elapsed wall-clock time is measured with ``time.perf_counter_ns``
    and written to ``fn_logger`` at DEBUG level, in milliseconds with
    three decimal places, after the call returns. Nothing is logged if
    the wrapped function raises.

    Coroutine functions are supported: the wrapper awaits the coroutine,
    so the time reported covers its execution rather than just its
    creation, and the decorated function remains a coroutine function.

    :param fn_logger: The logger that receives the timing message.
    :return: A decorator that wraps a callable with timing.
    """
    import inspect

    def decorator(f: Callable) -> Callable:
        def log_elapsed(start: int) -> None:
            elapsed = time.perf_counter_ns() - start
            # %.3f, not %d.3: the latter truncates to an integer and
            # appends a literal ".3" to every message.
            fn_logger.debug(
                "Ran %s in %.3f ms", f.__qualname__, elapsed / 1_000_000
            )

        if inspect.iscoroutinefunction(f):

            @wraps(f)
            async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
                start = time.perf_counter_ns()
                result = await f(*args, **kwargs)
                log_elapsed(start)
                return result

            return async_wrapper

        @wraps(f)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            start = time.perf_counter_ns()
            result = f(*args, **kwargs)
            log_elapsed(start)
            return result

        return wrapper

    return decorator
+ re.compile(r"^[^a-z0-9]+|[^a-z0-9]+$", re.IGNORECASE).sub("", token) + for token in docket_number.split() + ] + matching_parts = [] + for token in tokens: for regex in TEXAS_DN_REGEXES: if regex.fullmatch(token): - return token + matching_parts.append(token) - return "" + if len(matching_parts) == 0: + logger.warning( + "Could not find valid Texas docket number in string %s. Using empty string as clean docket number", + docket_number, + ) + return "" + + matching_parts.sort() + if len(matching_parts) > 1: + logger.warning( + "Found multiple docket numbers combined %s. Using %s as clean docket number.", + docket_number, + matching_parts[0], + ) + return matching_parts[0] def make_texas_docket_number_core(docket_number: str | None) -> str: diff --git a/cl/lib/pacer_session.py b/cl/lib/pacer_session.py index 50f3056fad..3c1c3829a8 100644 --- a/cl/lib/pacer_session.py +++ b/cl/lib/pacer_session.py @@ -1,12 +1,12 @@ import pickle import random from dataclasses import dataclass -from urllib.parse import urlparse +from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings +from httpx import URL, Cookies, Request from juriscraper.pacer import PacerSession from redis import Redis -from requests.cookies import RequestsCookieJar from cl.lib.redis_utils import get_redis_interface @@ -26,7 +26,7 @@ class SessionData: `ProxyPacerSession` class. 
""" - cookies: RequestsCookieJar + cookies: Cookies proxy_address: str = "" def __post_init__(self): @@ -68,10 +68,10 @@ def __init__( } self.headers["X-WhSentry-TLS"] = "true" - def send(self, request, **kwargs): + async def send(self, request: Request, **kwargs): """Send a given PreparedRequest.""" request.url = self._change_protocol(request.url) - return super().send(request, **kwargs) + return await super().send(request, **kwargs) def _pick_proxy_connection(self) -> str: """ @@ -85,7 +85,7 @@ def _pick_proxy_connection(self) -> str: """ return random.choice(settings.EGRESS_PROXY_HOSTS) - def _change_protocol(self, url: str) -> str: + def _change_protocol(self, url: URL | str) -> URL: """Converts a URL from HTTPS to HTTP protocol. By default, HTTP clients create a CONNECT tunnel when a proxy is @@ -100,26 +100,25 @@ def _change_protocol(self, url: str) -> str: https://github.com/juggernaut/webhook-sentry?tab=readme-ov-file#https-target Args: - url (str): The URL to modify. + url (URL): The URL to modify. Returns: - str: The URL with the protocol changed from HTTPS to HTTP. + URL: The URL with the protocol changed from HTTPS to HTTP. 
""" - new_url = urlparse(url) - return new_url._replace(scheme="http").geturl() + return URL(url, scheme="http") - def _prepare_login_request(self, url, *args, **kwargs): - return super(PacerSession, self).post( + async def _prepare_login_request(self, url, *args, **kwargs): + return await super(PacerSession, self).post( self._change_protocol(url), **kwargs ) - def post(self, url, *args, **kwargs): - return super().post(self._change_protocol(url), **kwargs) + async def post(self, url, *args, **kwargs): + return await super().post(self._change_protocol(url), **kwargs) - def get(self, url, *args, **kwargs): - return super().get(self._change_protocol(url), **kwargs) + async def get(self, url, *args, **kwargs): + return await super().get(self._change_protocol(url), **kwargs) - def _get_saml_auth_request_parameters( + async def _get_saml_auth_request_parameters( self, court_id: str ) -> dict[str, str]: """ @@ -129,14 +128,16 @@ def _get_saml_auth_request_parameters( workflow can be reused in subsequent requests through a proxy connection by setting their 'secure' attribute to False. """ - saml_credentials = super()._get_saml_auth_request_parameters(court_id) + saml_credentials = await super()._get_saml_auth_request_parameters( + court_id + ) # Update cookies so they can be sent over non-HTTPS connections for cookie in self.cookies: cookie.secure = False return saml_credentials -def log_into_pacer( +async def log_into_pacer( username: str, password: str, client_code: str | None = None, @@ -154,10 +155,11 @@ def log_into_pacer( password=password, client_code=client_code, ) - s.login() + await s.login() return SessionData(s.cookies, s.proxy_address) +@sync_to_async def get_or_cache_pacer_cookies( user_pk: str | int, username: str, @@ -184,7 +186,7 @@ def get_or_cache_pacer_cookies( :return: A SessionData object containing the session's cookies and proxy. 
""" r = get_redis_interface("CACHE", decode_responses=False) - cookies_data = get_pacer_cookie_from_cache(user_pk, r=r) + cookies_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk, r=r) ttl_seconds = r.ttl(session_key % user_pk) if cookies_data and ttl_seconds >= 300 and not refresh: # cookies were found in cache and ttl >= 5 minutes, return them @@ -192,7 +194,9 @@ def get_or_cache_pacer_cookies( # Unable to find cookies in cache, are about to expire or refresh needed # Login and cache new values. - session_data = log_into_pacer(username, password, client_code) + session_data = async_to_sync(log_into_pacer)( + username, password, client_code + ) cookie_expiration = 60 * 60 r.set( session_key % user_pk, @@ -202,6 +206,7 @@ def get_or_cache_pacer_cookies( return session_data +@sync_to_async def get_pacer_cookie_from_cache( user_pk: str | int, r: Redis | None = None, @@ -220,6 +225,7 @@ def get_pacer_cookie_from_cache( return pickle.loads(pickled_cookie) +@sync_to_async def delete_pacer_cookie_from_cache( user_pk: str | int, r: Redis | None = None, diff --git a/cl/lib/tests.py b/cl/lib/tests.py index 6ce719e8fb..91b71424ea 100644 --- a/cl/lib/tests.py +++ b/cl/lib/tests.py @@ -242,7 +242,7 @@ def test_compute_new_cookies_with_new_format(self, mock_log_into_pacer): self.test_cookies, "http://proxy_1:9090", ) - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_user_new_cookie", username="test", password="password" ) self.assertEqual(mock_log_into_pacer.call_count, 1) @@ -252,7 +252,7 @@ def test_compute_new_cookies_with_new_format(self, mock_log_into_pacer): @patch("cl.lib.pacer_session.log_into_pacer") def test_parse_cookie_proxy_pair_properly(self, mock_log_into_pacer): """Can we parse the dataclass from cache properly?""" - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_user_new_format", username="test", password="password" ) 
self.assertEqual(mock_log_into_pacer.call_count, 0) @@ -270,7 +270,7 @@ def test_compute_cookies_for_almost_expired_data( # Attempts to get almost expired cookies with the new format from cache # Expects refresh. - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_new_format_almost_expired", username="test", password="password", @@ -451,7 +451,10 @@ def test_is_texas_court(self) -> None: def test_clean_texas_docket_number(self) -> None: """Can we extract Texas docket numbers from dirty input?""" test_cases = [ + ("Case Number: AP-77,129; 04-97-00972-CV", "04-97-00972-CV"), ("Case Number: 04-97-00972-CV", "04-97-00972-CV"), + ("Case Number: 04-97-00972-CV; AP-77,129", "04-97-00972-CV"), + ("AP-77,129, 04-97-00972-CV, and WR-70,849-04", "04-97-00972-CV"), ("04-97-00972-CV", "04-97-00972-CV"), ("AP-77,129", "AP-77,129"), ("WR-70,849-04", "WR-70,849-04"), @@ -460,9 +463,13 @@ def test_clean_texas_docket_number(self) -> None: ("", ""), ("garbage text", ""), ] - for input_dn, expected in test_cases: + for i, (input_dn, expected) in enumerate(test_cases): with self.subTest(input_dn=input_dn): - self.assertEqual(clean_texas_docket_number(input_dn), expected) + self.assertEqual( + clean_texas_docket_number(input_dn), + expected, + f"Failed test case {i}", + ) def test_texas_docket_number_core(self) -> None: """Can we correctly normalize Texas docket numbers?""" diff --git a/cl/people_db/lookup_utils.py b/cl/people_db/lookup_utils.py index a376c8c36f..62c8c74d11 100644 --- a/cl/people_db/lookup_utils.py +++ b/cl/people_db/lookup_utils.py @@ -555,7 +555,8 @@ async def lookup_judge_by_full_name_and_set_attr( target_field: str, full_name: HumanName | str, court_id: str, - event_date: date, + event_date: date | None, + require_living_judge: bool = True, ) -> None: """Lookup a judge by the attribute of an object @@ -564,11 +565,15 @@ async def lookup_judge_by_full_name_and_set_attr( :param full_name: The full name of the 
judge to look up :param court_id: The court where the judge did something :param event_date: The date the judge did something + :param require_living_judge: Whether to ensure that the judge found was + born before the event date and died after it. :return None """ if not full_name: return None - judge = await lookup_judge_by_full_name(full_name, court_id, event_date) + judge = await lookup_judge_by_full_name( + full_name, court_id, event_date, require_living_judge + ) setattr(item, target_field, judge) diff --git a/cl/recap/api_serializers.py b/cl/recap/api_serializers.py index 88a7c4256e..6710fbb8d3 100644 --- a/cl/recap/api_serializers.py +++ b/cl/recap/api_serializers.py @@ -1,3 +1,4 @@ +from asgiref.sync import async_to_sync from juriscraper.lib.exceptions import PacerLoginException from rest_framework import serializers from rest_framework.exceptions import ValidationError @@ -128,7 +129,7 @@ def validate(self, attrs): UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE, ]: # Appellate court dockets. Is the court valid? - if not is_appellate_court(attrs["court"].pk): + if not async_to_sync(is_appellate_court)(attrs["court"].pk): raise ValidationError( "{} is not an appellate court ID. Did you mean to use the " "upload_type for district dockets?".format(attrs["court"]) @@ -319,7 +320,7 @@ def validate(self, attrs): if ( attrs.get("pacer_case_id") and not attrs.get("docket_number") - and is_appellate_court(attrs.get("court").pk) + and async_to_sync(is_appellate_court)(attrs.get("court").pk) ): # The user is trying to purchase an appellate docket using only the # PACER case ID, which is not supported. @@ -331,7 +332,7 @@ def validate(self, attrs): court_id = get_court_id_from_fetch_queue(attrs) if ( attrs.get("de_number_end") or attrs.get("de_number_start") - ) and is_appellate_court(court_id): + ) and async_to_sync(is_appellate_court)(court_id): raise ValidationError( "Docket entry filtering by number is not supported for " "appellate courts. 
Use date range filtering with " @@ -371,7 +372,7 @@ def validate(self, attrs): # Do the PACER credentials work? try: - _ = get_or_cache_pacer_cookies( + _ = async_to_sync(get_or_cache_pacer_cookies)( attrs["user"].pk, username=attrs.pop("pacer_username"), password=attrs.pop("pacer_password"), diff --git a/cl/recap/management/commands/merge_idb_into_dockets.py b/cl/recap/management/commands/merge_idb_into_dockets.py index ba90c71071..d8fc587108 100644 --- a/cl/recap/management/commands/merge_idb_into_dockets.py +++ b/cl/recap/management/commands/merge_idb_into_dockets.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings from juriscraper.lib.string_utils import CaseNameTweaker @@ -119,7 +120,7 @@ def update_any_missing_pacer_case_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, d in enumerate(ds.iterator()): if i < options["offset"]: continue @@ -132,7 +133,7 @@ def update_any_missing_pacer_case_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("Getting pacer_case_id for item %s", d) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 317b827ce9..fd8f84c05f 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -20,7 +20,7 @@ set_skip_percolation_if_parties_data, ) from cl.corpus_importer.utils import ( - ais_appellate_court, + is_appellate_court, is_long_appellate_document_number, mark_ia_upload_needed, ) @@ -134,7 +134,9 @@ async def find_docket_object( federal_dn_judge_initials_assigned: str | None, federal_dn_judge_initials_referred: str | None, using: str = "default", -) -> Docket: + docket_source: int = Docket.RECAP, + allow_create: bool = True, +) -> Docket | None: """Attempt to find the docket based on the parsed docket data. 
If cannot be found, create a new docket. If multiple are found, return the oldest. @@ -147,6 +149,9 @@ async def find_docket_object( :param federal_dn_judge_initials_referred: The judge's initials referred to validate the match. :param using: The database to use for the lookup queries. + :param docket_source: The source to set when creating a new docket. + :param allow_create: Whether to create a new docket if no matching one is + found :return The docket found or created. """ # Attempt several lookups of decreasing specificity. Note that @@ -265,10 +270,14 @@ async def find_docket_object( break if d is None: # Couldn't find a docket. Return a new one. - return Docket( - source=Docket.RECAP, - pacer_case_id=pacer_case_id, - court_id=court_id, + return ( + Docket( + source=docket_source, + pacer_case_id=pacer_case_id, + court_id=court_id, + ) + if allow_create + else None ) if using != "default": @@ -992,7 +1001,8 @@ async def add_docket_entries( # entry, we avoid creating the main RD a second+ time when we get the # docket sheet a second+ time. - appellate_court_id_exists = await ais_appellate_court(d.court_id) + appellate_court_id_exists = await is_appellate_court(d.court_id) + appellate_rd_att_exists = False if de_created is False and appellate_court_id_exists: # In existing appellate entry merges, check if the entry has at # least one attachment. 
@@ -1953,7 +1963,7 @@ async def merge_attachment_page_data( ContentFile(text.encode()), ) - court_is_appellate = await ais_appellate_court(court.pk) + court_is_appellate = await is_appellate_court(court.pk) main_rd_to_att = False for attachment in attachment_dicts: sanity_checks = [ diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 3d281aa91f..2b607e7392 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -12,7 +12,6 @@ from zipfile import ZipFile import httpx -import requests from asgiref.sync import async_to_sync, sync_to_async from botocore import exceptions as botocore_exception from celery import Task @@ -42,8 +41,6 @@ from juriscraper.pacer.email import DocketType from lxml.etree import ParserError from redis import ConnectionError as RedisConnectionError -from requests import HTTPError -from requests.packages.urllib3.exceptions import ReadTimeoutError from cl.alerts.tasks import enqueue_docket_alert, send_alert_and_webhook from cl.alerts.utils import ( @@ -64,7 +61,6 @@ update_rd_metadata, ) from cl.corpus_importer.utils import ( - ais_appellate_court, is_appellate_court, is_bankruptcy_court, is_long_appellate_document_number, @@ -188,7 +184,7 @@ def do_pacer_fetch(fq: PacerFetchQueue): court_id = get_court_id_from_fetch_queue(fq) c = ( chain(fetch_appellate_docket.si(fq.pk)) - if is_appellate_court(court_id) + if async_to_sync(is_appellate_court)(court_id) else chain(fetch_docket.si(fq.pk)) ) c = c | mark_fq_successful.si(fq.pk) @@ -421,7 +417,7 @@ async def process_recap_pdf(pk, subdocket_replication: bool = False): # from PQ if this task is part of a subdocket replication. In subdockets, # this metadata may differ even when the document is the same. 
if ( - not await ais_appellate_court(court_id) + not await is_appellate_court(court_id) or not is_long_appellate_document_number(rd.document_number) ) and not subdocket_replication: rd.document_number = str(pq.document_number) @@ -820,7 +816,7 @@ async def find_subdocket_pdf_rds( ) subdocket_replication = False - if await ais_appellate_court(pq.court_id): + if await is_appellate_court(pq.court_id): # Abort the process for appellate documents. Subdockets cannot be found # in appellate cases. return [(pq.pk, subdocket_replication)] @@ -1978,15 +1974,15 @@ def fetch_pacer_doc_by_rd_base( if not is_pacer_court_accessible(rd.docket_entry.docket.court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {rd.docket_entry.docket.court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + async_to_sync(mark_fq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) if rd.is_available: msg = "PDF already marked as 'is_available'. Doing nothing." - mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) self.request.chain = None return @@ -1998,14 +1994,16 @@ def fetch_pacer_doc_by_rd_base( "document associated with it, or it may need to be updated via " "the docket report to acquire a pacer_doc_id. Aborting request." ) - mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.INVALID_CONTENT + ) self.request.chain = None return - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return @@ -2014,14 +2012,14 @@ def fetch_pacer_doc_by_rd_base( court_id = rd.docket_entry.docket.court_id try: if rd.is_acms_document(): - r, r_msg = download_acms_pdf_by_rd( + r, r_msg = async_to_sync(download_acms_pdf_by_rd)( court_id=court_id, acms_entry_id=rd.pacer_doc_id, acms_doc_id=rd.acms_document_guid, session_data=session_data, ) else: - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, pacer_doc_id, @@ -2029,19 +2027,19 @@ def fetch_pacer_doc_by_rd_base( magic_number, de_seq_num=de_seq_num, ) - except (requests.RequestException, HTTPError): + except (httpx.RequestError, httpx.HTTPError): msg = "Failed to get PDF from network." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return except PacerLoginException as exc: msg = f"PacerLoginException while getting document for rd: {rd.pk}." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - delete_pacer_cookie_from_cache(fq.user_id) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(delete_pacer_cookie_from_cache)(fq.user_id) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2049,8 +2047,7 @@ def fetch_pacer_doc_by_rd_base( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2063,13 +2060,13 @@ def fetch_pacer_doc_by_rd_base( ) if success is False: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return # Logic to replicate the PDF sub-dockets matched by RECAPDocument subdocket_pqs_to_replicate = [] - if not is_appellate_court(court_id): + if not async_to_sync(is_appellate_court)(court_id): subdocket_pqs_to_replicate = find_subdocket_pdf_rds_from_data( fq.user_id, court_id, pacer_doc_id, [pacer_case_id], pdf_bytes ) @@ -2144,7 +2141,7 @@ def fetch_pacer_doc_by_rd_and_mark_fq_completed( # case, fetch_pacer_doc_by_rd_base will return None. fq = PacerFetchQueue.objects.get(pk=fq_pk) msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) return None @@ -2180,15 +2177,15 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] raise self.retry() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + async_to_sync(mark_fq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) if not pacer_doc_id: msg = f"Unable to get attachment page: Unknown pacer_doc_id for RECAP Document object {rd.pk}" - mark_fq_status(fq, msg, PROCESSING_STATUS.NEEDS_INFO) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.NEEDS_INFO) self.request.chain = None return [] @@ -2199,15 +2196,15 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: self.request.chain = None return [] - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] try: - r = get_att_report_by_rd(rd, session_data) + r = async_to_sync(get_att_report_by_rd)(rd, session_data) except ParserError as exc: if self.request.retries == self.max_retries: msg = "ParserError while getting attachment page" @@ -2215,14 +2212,24 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: self.request.chain = None return [] raise self.retry(exc=exc) - except HTTPError as exc: + except httpx.RequestError as exc: + if self.request.retries == self.max_retries: + msg = "Failed to get attachment page from network." + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None + return [] + logger.info("Ran into a RequestException. Retrying.") + raise self.retry(exc=exc) + except httpx.HTTPError as exc: msg = "Failed to get attachment page from network." if exc.response.status_code in [ HTTPStatus.INTERNAL_SERVER_ERROR, HTTPStatus.GATEWAY_TIMEOUT, ]: if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.FAILED + ) self.request.chain = None return [] logger.info( @@ -2230,30 +2237,22 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: ) raise self.retry(exc=exc) else: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - self.request.chain = None - return [] - except requests.RequestException as exc: - if self.request.retries == self.max_retries: - msg = "Failed to get attachment page from network." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] - logger.info("Ran into a RequestException. 
Retrying.") - raise self.retry(exc=exc) except PacerLoginException as exc: msg = "PacerLoginException while getting attachment page" if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - delete_pacer_cookie_from_cache(fq.user_id) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(delete_pacer_cookie_from_cache)(fq.user_id) self.request.chain = None return [] - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) - is_appellate = is_appellate_court(court_id) + is_appellate = async_to_sync(is_appellate_court)(court_id) if not is_acms_case: text = r.response.text # Determine the appropriate parser function based on court jurisdiction @@ -2270,7 +2269,9 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: if att_data == {}: msg = "Not a valid attachment page upload" - mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.INVALID_CONTENT + ) self.request.chain = None return [] @@ -2297,22 +2298,24 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: "Too many documents found when attempting to associate " "attachment data" ) - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] - mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY + ) raise self.retry(exc=exc) msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) # Logic to replicate the attachment page to sub-dockets matched by RECAPDocument - if is_appellate_court(court_id): + if async_to_sync(is_appellate_court)(court_id): # Subdocket replication for appellate courts is currently not supported. self.request.chain = None return [] @@ -2410,7 +2413,7 @@ def get_fq_appellate_docket_kwargs(fq: PacerFetchQueue): } -def fetch_pacer_case_id_and_title(s, fq, court_id): +async def fetch_pacer_case_id_and_title(s, fq, court_id): """Use PACER's hidden API to learn the pacer_case_id of a case :param s: A PacerSession object to use @@ -2427,7 +2430,7 @@ def fetch_pacer_case_id_and_title(s, fq, court_id): ) report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s) - report.query(docket_number) + await report.query(docket_number) return report.data() return {} @@ -2483,7 +2486,7 @@ def fetch_docket_by_pacer_case_id( :return: a dict with information about the docket and the new data """ report = DocketReport(map_cl_to_pacer_id(court_id), session) - report.query(pacer_case_id, **get_fq_docket_kwargs(fq)) + async_to_sync(report.query)(pacer_case_id, **get_fq_docket_kwargs(fq)) docket_data = report.data if not docket_data: @@ -2512,7 +2515,7 @@ def purchase_appellate_docket_by_docket_number( if should_check_acms_court(court_id): acms_search = AcmsCaseSearch(court_id=court_id, pacer_session=session) - acms_search.query(docket_number) + async_to_sync(acms_search.query)(docket_number) acms_case_id = ( acms_search.data["pcx_caseid"] if acms_search.data else None ) @@ -2524,9 +2527,9 @@ def purchase_appellate_docket_by_docket_number( if acms_case_id: # ACMSDocketReport only accepts the case ID; filters are not currently # supported for ACMS docket reports. 
- report.query(acms_case_id) + async_to_sync(report.query)(acms_case_id) else: - report.query(docket_number, **kwargs) + async_to_sync(report.query)(docket_number, **kwargs) docket_data = report.data if not docket_data: @@ -2564,17 +2567,17 @@ def fetch_appellate_docket(self, fq_pk): if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() async_to_sync(mark_pq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if session_data is None: msg = f"Cookie cache expired before task could run for user: {fq.user_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2594,13 +2597,13 @@ def fetch_appellate_docket(self, fq_pk): fq=fq, **get_fq_appellate_docket_kwargs(fq), ) - except (requests.RequestException, ReadTimeoutError) as exc: + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = f"Network error while purchasing docket for fq: {fq_pk}." if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg}Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2609,16 +2612,16 @@ def fetch_appellate_docket(self, fq_pk): f"PacerLoginException while getting pacer_case_id for fq: {fq_pk}." 
) if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = f"Unable to purchase docket for fq: {fq_pk}." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2662,17 +2665,17 @@ def fetch_docket(self, fq_pk): if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() async_to_sync(mark_pq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if session_data is None: msg = f"Cookie cache expired before task could run for user: {fq.user_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2680,14 +2683,14 @@ def fetch_docket(self, fq_pk): cookies=session_data.cookies, proxy=session_data.proxy_address ) try: - result = fetch_pacer_case_id_and_title(s, fq, court_id) - except (requests.RequestException, ReadTimeoutError) as exc: + result = async_to_sync(fetch_pacer_case_id_and_title)(s, fq, court_id) + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = f"Network error getting pacer_case_id for fq: {fq_pk}." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2696,16 +2699,16 @@ def fetch_docket(self, fq_pk): f"PacerLoginException while getting pacer_case_id for fq: {fq_pk}." ) if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = "Unable to parse pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2716,7 +2719,7 @@ def fetch_docket(self, fq_pk): if result is None: msg = "Cannot find case by docket number (perhaps it's sealed?)" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2728,26 +2731,26 @@ def fetch_docket(self, fq_pk): if not pacer_case_id: msg = "Unable to determine pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None start_time = now() try: result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq) - except (requests.RequestException, ReadTimeoutError) as exc: + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = "Network error getting pacer_case_id for fq: %s." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg}Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = "Unable to parse pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2770,10 +2773,10 @@ def fetch_docket(self, fq_pk): def mark_fq_successful(fq_pk): fq = PacerFetchQueue.objects.get(pk=fq_pk) msg = "Successfully completed fetch and save." - mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) -def mark_fq_status(fq, msg, status): +async def mark_fq_status(fq, msg, status): """Update the PacerFetchQueue item with the status and message provided :param fq: The PacerFetchQueue item to update @@ -2786,8 +2789,8 @@ def mark_fq_status(fq, msg, status): fq.status = status if status == PROCESSING_STATUS.SUCCESSFUL: fq.date_completed = now() - fq.save() - send_recap_fetch_webhooks(fq) + await fq.asave() + await send_recap_fetch_webhooks(fq) def get_recap_email_recipients( @@ -2809,7 +2812,9 @@ def get_recap_email_recipients( return recap_email_recipients -def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: +async def get_attachment_page_by_url( + att_page_url: str, court_id: str +) -> str | None: """Get the attachment page report for recap.email documents without being logged into PACER. 
@@ -2823,7 +2828,8 @@ def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: att_page_url, ) req_timeout = (60, 300) - att_response = requests.get(att_page_url, timeout=req_timeout) + async with httpx.AsyncClient() as client: + att_response = await client.get(att_page_url, timeout=req_timeout) att_data = get_data_from_att_report(att_response.text, court_id) if att_data == {}: msg = "Not a valid attachment page upload for recap.email" @@ -2832,7 +2838,7 @@ def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: return att_response.text -def set_rd_sealed_status( +async def set_rd_sealed_status( rd: RECAPDocument, magic_number: str | None, potentially_sealed: bool ) -> None: """Set RD is_sealed status according to the following conditions: @@ -2848,25 +2854,26 @@ def set_rd_sealed_status( :return: None """ - rd.refresh_from_db() + await rd.arefresh_from_db() if not rd.pacer_doc_id: return if not potentially_sealed: rd.is_sealed = False - rd.save() + await rd.asave() return rd.is_sealed = True - if not magic_number and not is_pacer_doc_sealed( - rd.docket_entry.docket.court.pk, rd.pacer_doc_id + docket_entry = await DocketEntry.objects.aget(id=rd.docket_entry_id) + docket = await Docket.objects.aget(id=docket_entry.docket_id) + if not magic_number and not await is_pacer_doc_sealed( + docket.court_id, rd.pacer_doc_id ): rd.is_sealed = False - rd.save() + await rd.asave() -def save_pacer_doc_from_pq( - self: Task, +async def save_pacer_doc_from_pq( rd: RECAPDocument, fq: PacerFetchQueue, pq: ProcessingQueue, @@ -2885,23 +2892,22 @@ def save_pacer_doc_from_pq( if rd.is_available: msg = "PDF already marked as 'is_available'. Doing nothing." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + await mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) return if pq.status == PROCESSING_STATUS.FAILED or not pq.filepath_local: - set_rd_sealed_status(rd, magic_number, potentially_sealed=True) - mark_fq_status(fq, pq.error_message, PROCESSING_STATUS.FAILED) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=True) + await mark_fq_status(fq, pq.error_message, PROCESSING_STATUS.FAILED) return with pq.filepath_local.open(mode="rb") as local_path: pdf_bytes = local_path.read() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + await mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) pacer_case_id = rd.docket_entry.docket.pacer_case_id court_id = rd.docket_entry.docket.court_id - success, msg = update_rd_metadata( - self, + success, msg = await update_rd_metadata( rd.pk, pdf_bytes, pq.error_message, @@ -2913,12 +2919,12 @@ def save_pacer_doc_from_pq( ) if success is False: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + await mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) return msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) - set_rd_sealed_status(rd, magic_number, potentially_sealed=False) + await mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=False) return rd.pk @@ -3001,7 +3007,7 @@ def download_pacer_pdf_and_save_to_pq( date_created__gt=cutoff_date, ) if created and magic_number and not is_bankr_short_doc_id: - response, r_msg = download_pdf_by_magic_number( + response, r_msg = async_to_sync(download_pdf_by_magic_number)( court_id, pacer_doc_id, pacer_case_id, @@ -3059,7 +3065,7 @@ def get_and_copy_recap_attachment_docs( :return: None """ - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk) appellate = False unique_pqs = [] for rd_att in att_rds: @@ -3081,7 +3087,7 @@ def get_and_copy_recap_attachment_docs( request_type=REQUEST_TYPE.PDF, recap_document=rd_att, ) - save_pacer_doc_from_pq(self, rd_att, fq, pq, magic_number) + async_to_sync(save_pacer_doc_from_pq)(rd_att, fq, pq, magic_number) if pq not in unique_pqs: unique_pqs.append(pq) @@ -3152,7 +3158,7 @@ def open_and_validate_email_notification( return data, body -def fetch_attachment_data( +async def fetch_attachment_data( document_url: str, court_id: str, dockets_updated: list[DocketUpdatedData], @@ -3169,18 +3175,18 @@ def fetch_attachment_data( :param user_pk: The user to associate with the ProcessingQueue object. :return: The HTML page text. """ - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = await get_pacer_cookie_from_cache(user_pk) # Try to fetch the attachment page without being logged into PACER using # the free look URL. 
- att_report_text = get_attachment_page_by_url(document_url, court_id) + att_report_text = await get_attachment_page_by_url(document_url, court_id) if att_report_text is None: main_rd = ( - dockets_updated[0] + await dockets_updated[0] .des_returned[0] - .recap_documents.earliest("date_created") + .recap_documents.aearliest("date_created") ) # Get the attachment page being logged into PACER - att_report = get_att_report_by_rd(main_rd, session_data) + att_report = await get_att_report_by_rd(main_rd, session_data) att_report_text = att_report.response.text return att_report_text @@ -3327,7 +3333,7 @@ def get_acms_pacer_case_id( cookies=session_data.cookies, proxy=session_data.proxy_address ) acms_search = AcmsCaseSearch(court_id=court_id, pacer_session=s) - acms_search.query(docket_number) + async_to_sync(acms_search.query)(docket_number) return acms_search.data["pcx_caseid"] if acms_search.data else None @@ -3336,9 +3342,9 @@ def get_acms_pacer_case_id( autoretry_for=( botocore_exception.HTTPClientError, botocore_exception.ConnectionError, - requests.ConnectionError, - requests.RequestException, - requests.ReadTimeout, + httpx.ConnectError, + httpx.RequestError, + httpx.ReadTimeout, PacerLoginException, RedisConnectionError, ), @@ -3390,7 +3396,7 @@ def process_recap_email( start_time = now() # Ensures we have PACER cookies ready to go. 
- cookies_data = get_or_cache_pacer_cookies( + cookies_data = async_to_sync(get_or_cache_pacer_cookies)( user_pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) court_id = epq.court_id @@ -3420,7 +3426,9 @@ def process_recap_email( acms=acms, ) is_potentially_sealed_entry = ( - is_docket_entry_sealed(epq.court_id, pacer_case_id, pacer_doc_id) + async_to_sync(is_docket_entry_sealed)( + epq.court_id, pacer_case_id, pacer_doc_id + ) if pq.status == PROCESSING_STATUS.FAILED and not appellate and not bankr_short_doc_id @@ -3432,7 +3440,7 @@ def process_recap_email( ] if (appellate or acms) and doc_num_from_data is None: # Get the document number for appellate documents. - appellate_doc_num = get_document_number_for_appellate( + appellate_doc_num = async_to_sync(get_document_number_for_appellate)( epq.court_id, pacer_doc_id, pq, acms ) if appellate_doc_num: @@ -3517,7 +3525,7 @@ def process_recap_email( request_type=REQUEST_TYPE.PDF, recap_document=rd, ) - save_pacer_doc_from_pq(self, rd, fq, pq, magic_number) + async_to_sync(save_pacer_doc_from_pq)(rd, fq, pq, magic_number) rd.refresh_from_db() main_rds_available.append(rd.is_available) @@ -3532,7 +3540,7 @@ def process_recap_email( and not is_potentially_sealed_entry and not bankr_short_doc_id ): - att_report_text = fetch_attachment_data( + att_report_text = async_to_sync(fetch_attachment_data)( document_url, epq.court_id, dockets_updated, user_pk ) all_attachment_rds = merge_rd_attachments( @@ -3561,7 +3569,7 @@ def process_recap_email( pacer_doc_id and content_to_replicate and got_content_updated - and not is_appellate_court(court_id) + and not async_to_sync(is_appellate_court)(court_id) ): replicate_recap_email_to_subdockets( user_pk, diff --git a/cl/recap/tests/test_recap_email.py b/cl/recap/tests/test_recap_email.py index 514ccff062..83ea20568a 100644 --- a/cl/recap/tests/test_recap_email.py +++ b/cl/recap/tests/test_recap_email.py @@ -3,6 +3,7 @@ from pathlib import Path from unittest import mock +import 
httpx from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings from django.contrib.auth.hashers import make_password @@ -1323,8 +1324,8 @@ async def test_docket_alert_toggle_confirmation_fails( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @@ -1333,9 +1334,9 @@ async def test_docket_alert_toggle_confirmation_fails( side_effect=lambda *args, **kwargs: MockResponse(200, mock_raw=True), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("nyed_123019137279.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, content=mock_bucket_open("nyed_123019137279.html", "r", True) ), ) async def test_new_recap_email_with_attachments( @@ -1743,15 +1744,15 @@ async def test_new_nda_recap_email_case_no_auto_subscription( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("jpml_85001321035.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, content=mock_bucket_open("jpml_85001321035.html", "r", True) ), ) @mock.patch( @@ -2055,9 +2056,9 @@ async def test_mark_as_sealed_nda_document_not_available_from_magic_link( side_effect=lambda *args, **kwargs: MockResponse(200, mock_raw=True), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("nyed_123019137279.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, 
content=mock_bucket_open("nyed_123019137279.html", "r", True) ), ) async def test_mark_as_sealed_nef_documents_not_available_from_magic_link( @@ -3139,10 +3140,10 @@ def setUp(self) -> None: @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open("nda_document.pdf", "rb", True), + content=mock_bucket_open("nda_document.pdf", "rb", True), ), "OK", ), @@ -3177,10 +3178,10 @@ async def test_nda_get_document_number_from_pdf( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open( + content=mock_bucket_open( "gov.uscourts.ca8.17-2543.00803263743.0.pdf", "rb", True ), ), @@ -3224,10 +3225,10 @@ async def test_nda_get_document_number_from_confirmation_page( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open( + content=mock_bucket_open( "gov.uscourts.ca8.17-2543.00803263743.0.pdf", "rb", True ), ), @@ -3270,8 +3271,8 @@ async def test_nda_get_document_number_fallback( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b""), "OK", ), ) @@ -3309,8 +3310,8 @@ async def test_nda_not_document_number_available( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b""), "OK", ), ) @@ -3359,7 +3360,7 @@ async def test_receive_same_recap_email_nda_notification_different_users( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( + return_value=( None, "Document not available from magic link.", ), @@ 
-3397,13 +3398,13 @@ async def test_nda_document_not_available_get_from_confirmation_page( self.assertEqual(recap_document_first.docket_entry.entry_number, 148) -def mock_method_set_rd_sealed_status( +async def mock_method_set_rd_sealed_status( rd: RECAPDocument, magic_number: str | None, potentially_sealed: bool ) -> None: if rd.document_type == RECAPDocument.PACER_DOCUMENT: - set_rd_sealed_status(rd, magic_number, potentially_sealed=True) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=True) return - return set_rd_sealed_status(rd, magic_number, potentially_sealed) + return await set_rd_sealed_status(rd, magic_number, potentially_sealed) @mock.patch("cl.recap.tasks.enqueue_docket_alert", return_value=True) @@ -3493,14 +3494,14 @@ def setUp(self) -> None: ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_nef_subdocket_replication_no_att( self, @@ -3639,14 +3640,14 @@ async def test_nef_subdocket_replication_no_att( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_multi_nef_subdocket_replication( self, @@ -3848,14 +3849,14 @@ async def test_multi_nef_subdocket_replication( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - 
side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_avoid_triggering_replication_for_minute_entries( self, @@ -4002,8 +4003,8 @@ async def test_avoid_replication_seal_document_and_sealed_attachments( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @@ -4013,7 +4014,7 @@ async def test_avoid_replication_seal_document_and_sealed_attachments( ) @mock.patch( "cl.recap.tasks.set_rd_sealed_status", - side_effect=mock_method_set_rd_sealed_status, + wraps=mock_method_set_rd_sealed_status, ) async def test_replication_sealed_document_with_no_sealed_attachments( self, @@ -4259,14 +4260,14 @@ async def test_avoid_replication_for_sealed_entry_with_attachments( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_recap_email_avoid_replication_on_pdf_available( self, diff --git a/cl/recap_rss/tasks.py b/cl/recap_rss/tasks.py index 4301d16256..42646c7862 100644 --- a/cl/recap_rss/tasks.py +++ b/cl/recap_rss/tasks.py @@ -6,7 +6,7 @@ from calendar import SATURDAY, SUNDAY from datetime import datetime, timedelta -import requests +import httpx from asgiref.sync import 
async_to_sync from celery import Task from dateparser import parse @@ -15,10 +15,10 @@ from django.core.mail import send_mail from django.db import IntegrityError, transaction from django.utils.timezone import now +from httpx import HTTPError from juriscraper.pacer import PacerRssFeed from pytz import timezone from redis import Redis -from requests import HTTPError from cl.alerts.tasks import enqueue_docket_alert from cl.celery_init import app @@ -205,8 +205,8 @@ def check_if_feed_changed( feed_status = RssFeedStatus.objects.get(pk=feed_status_pk) rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk)) try: - rss_feed.query() - except requests.RequestException: + async_to_sync(rss_feed.query)() + except httpx.RequestError: logger.warning( "Network error trying to get RSS feed at %s", rss_feed.url ) diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index 1dc71f57ec..5b5f287f47 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -114,7 +114,7 @@ def update_document_from_text( @app.task( bind=True, - autoretry_for=(requests.ConnectionError, requests.ReadTimeout), + autoretry_for=(httpx.ConnectError, httpx.ReadTimeout), max_retries=5, retry_backoff=10, ) @@ -253,7 +253,7 @@ def extract_opinion_content( # TODO: Remove after the new extract_opinion_content is deployed. @app.task( bind=True, - autoretry_for=(requests.ConnectionError, requests.ReadTimeout), + autoretry_for=(httpx.ConnectError, httpx.ReadTimeout), max_retries=5, retry_backoff=10, ) @@ -584,8 +584,8 @@ async def extract_pdf_document_base( @app.task( bind=True, autoretry_for=( - requests.ConnectionError, - requests.ReadTimeout, + httpx.ConnectError, + httpx.ReadTimeout, httpx.TimeoutException, ), max_retries=3, @@ -664,7 +664,7 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: :param court_id: The court of the docket. Needed for throttling by court. 
:return: None """ - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -678,8 +678,8 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: d = Docket.objects.get(pk=d_pk, court_id=court_id) report = CaseQuery(map_cl_to_pacer_id(d.court_id), s) try: - report.query(d.pacer_case_id) - except (requests.Timeout, requests.RequestException) as exc: + async_to_sync(report.query)(d.pacer_case_id) + except (httpx.Timeout, httpx.RequestError) as exc: logger.warning( "Timeout or unknown RequestException on iquery crawl. " "Trying again if retries not exceeded." diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index 9d7952e65e..a12790877a 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -164,12 +164,12 @@ async def test_for_meta_redirections( :param r: A response object :return: A boolean and value """ - response = await microservice( + mime_response = await microservice( service="buffer-extension", file=r.content, params={"mime": True}, ) - extension = response.text + extension = mime_response.text if extension == ".html": html_tree = html.fromstring(r.text) diff --git a/cl/search/admin.py b/cl/search/admin.py index 472702f06f..306a7dc44e 100644 --- a/cl/search/admin.py +++ b/cl/search/admin.py @@ -14,6 +14,7 @@ from cl.lib.string_utils import trunc from cl.search.models import ( BankruptcyInformation, + CaseTransfer, Citation, Claim, ClaimHistory, @@ -34,6 +35,7 @@ ScotusDocketMetadata, SCOTUSDocument, SearchQuery, + TrialCourtData, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument from cl.search.utils import seal_documents @@ -373,6 +375,37 @@ class BankruptcyInformationAdmin(admin.ModelAdmin): raw_id_fields = ("docket",) +@admin.register(CaseTransfer) +class CaseTransferAdmin(CursorPaginatorAdmin): + raw_id_fields = ( + "origin_court", + "origin_docket", + "destination_court", + 
"destination_docket", + ) + list_display = ( + "pk", + "origin_court", + "origin_docket_number", + "destination_court", + "destination_docket_number", + "transfer_date", + "transfer_type", + ) + list_filter = ( + "transfer_type", + "transfer_date", + ) + search_fields = ( + "origin_docket_number", + "destination_docket_number", + ) + readonly_fields = ( + "date_created", + "date_modified", + ) + + @admin.register(RECAPDocument) class RECAPDocumentAdmin(CursorPaginatorAdmin): search_fields = ( @@ -530,6 +563,30 @@ def change_view(self, request, object_id, form_url="", extra_context=None): ) +@admin.register(TrialCourtData) +class TrialCourtDataAdmin(CursorPaginatorAdmin): + raw_id_fields = ( + "docket", + "judge", + ) + autocomplete_fields = ("court",) + readonly_fields = ( + "date_created", + "date_modified", + ) + list_display = ( + "__str__", + "docket_number_trial", + "court_name", + "date_filed", + ) + search_help_text = "Search by docket ID or trial court docket number." + search_fields = ( + "=docket__id", + "docket_number_trial", + ) + + @admin.register(OpinionsCited) class OpinionsCitedAdmin(CursorPaginatorAdmin): raw_id_fields = ( diff --git a/cl/search/es_indices.py b/cl/search/es_indices.py index 8adb416307..73b6600059 100644 --- a/cl/search/es_indices.py +++ b/cl/search/es_indices.py @@ -48,7 +48,7 @@ # Define people elasticsearch index # Define opinion elasticsearch index -opinion_index = Index("opinion_index") +opinion_index = Index("case_law_index") opinion_index.settings( number_of_shards=settings.ELASTICSEARCH_OPINION_NUMBER_OF_SHARDS, number_of_replicas=settings.ELASTICSEARCH_OPINION_NUMBER_OF_REPLICAS, diff --git a/cl/search/factories.py b/cl/search/factories.py index 17743829d8..3cd640d7ee 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -25,6 +25,7 @@ from cl.search.models import ( PRECEDENTIAL_STATUS, BankruptcyInformation, + CaseTransfer, Citation, Court, Docket, @@ -37,6 +38,7 @@ ParentheticalGroup, RECAPDocument, 
SCOTUSDocketEntry, + TrialCourtData, ) from cl.tests.providers import LegalProvider @@ -465,3 +467,48 @@ class ScotusDocketDataFactory(DictFactory): questions_presented = Faker("url") docket_entries = List([SubFactory(SCOTUSDocketEntryDataFactory)]) parties = List([SubFactory(SCOTUSPartyDataFactory)]) + + +class CaseTransferFactory(DjangoModelFactory): + origin_court = SubFactory(CourtFactory) + origin_docket_number = LazyAttribute( + lambda ct: ct.origin_docket.docket_number if ct.origin_docket else None + ) + origin_docket = SubFactory(DocketFactory) + destination_court = SubFactory(CourtFactory) + destination_docket_number = LazyAttribute( + lambda ct: ct.destination_docket.docket_number + if ct.destination_docket + else None + ) + destination_docket = SubFactory(DocketFactory) + transfer_date = Faker("date_object") + transfer_type = Faker( + "random_element", + elements=( + CaseTransfer.APPEAL, + CaseTransfer.WORKLOAD, + CaseTransfer.MERGE, + CaseTransfer.JURISDICTION, + ), + ) + + class Meta: + model = CaseTransfer + + +class TrialCourtDataFactory(DjangoModelFactory): + docket = SubFactory(DocketFactory) + docket_number_trial = Faker("federal_district_docket_number") + docket_number_raw_trial = SelfAttribute("docket_number_trial") + judge_str = Faker("name") + judge = SubFactory(PersonFactory) + reporter = Faker("name") + date_filed = Faker("date_object") + court_name = Faker("court_name") + court = SubFactory(CourtFactory) + punishment = Faker("pystr") + county = Faker("pystr") + + class Meta: + model = TrialCourtData diff --git a/cl/search/management/commands/pacer_bulk_fetch.py b/cl/search/management/commands/pacer_bulk_fetch.py index 2bbe40c2dd..f04bd16e8f 100644 --- a/cl/search/management/commands/pacer_bulk_fetch.py +++ b/cl/search/management/commands/pacer_bulk_fetch.py @@ -5,6 +5,7 @@ from enum import Enum from typing import Any +from asgiref.sync import async_to_sync from django.contrib.auth.models import User from django.core.cache import cache from 
django.core.management.base import CommandError @@ -186,7 +187,7 @@ def setup_celery(self) -> None: def handle_pacer_session(self) -> None: """Make sure we have an active PACER session for the user.""" - get_or_cache_pacer_cookies( + async_to_sync(get_or_cache_pacer_cookies)( self.user.pk, username=self.pacer_username, password=self.pacer_password, diff --git a/cl/search/models.py b/cl/search/models.py index e2ea39cd00..89e2b4eeb9 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -1,12 +1,12 @@ import logging import re from datetime import datetime -from typing import TypeVar +from typing import Literal, TypeVar import nh3 import pghistory import pytz -from asgiref.sync import sync_to_async +from asgiref.sync import async_to_sync, sync_to_async from celery.canvas import chain from django.contrib.contenttypes.fields import GenericRelation from django.contrib.postgres.indexes import HashIndex @@ -736,6 +736,10 @@ def save(self, update_fields=None, *args, **kwargs): def get_absolute_url(self) -> str: return reverse("view_docket", args=[self.pk, self.slug]) + def add_scraper_source(self) -> None: + if self.source in self.NON_SCRAPER_SOURCES(): + self.source = self.source + self.SCRAPER + def add_recap_source(self): if self.source == self.DEFAULT: self.source = self.RECAP_AND_SCRAPER @@ -4030,6 +4034,83 @@ class CaseTransfer(AbstractDateTimeModel): choices=transfer_type_choices.items(), ) + # We currently only generate transfers for state courts, and we do not + # scrape trial courts so skip trying to populate fields we'll never be able + # to populate. + TRACKED_JURISDICTIONS = (Court.STATE_APPELLATE, Court.STATE_SUPREME) + + @classmethod + def _fill_null_docket_side( + cls, side: Literal["origin"] | Literal["destination"] + ) -> tuple[int, int]: + """Fill null docket FKs for one side (origin or destination). + + :param side: Either "origin" or "destination". + :return: Tuple of (updated_count, total_count). 
+ """ + from cl.recap.mergers import find_docket_object + + qs = cls.objects.filter( + **{ + f"{side}_court__jurisdiction__in": cls.TRACKED_JURISDICTIONS, + f"{side}_docket__isnull": True, + } + ) + total = qs.count() + updated_transfers: list[CaseTransfer] = [] + total_updated = 0 + + for transfer in qs.iterator(): + docket = async_to_sync(find_docket_object)( + court_id=getattr(transfer, f"{side}_court_id"), + pacer_case_id=None, + docket_number=getattr(transfer, f"{side}_docket_number"), + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + allow_create=False, + ) + if docket: + logger.info( + "Found %s docket %s!", + side, + getattr(transfer, f"{side}_docket_number"), + ) + setattr(transfer, f"{side}_docket", docket) + updated_transfers.append(transfer) + + if len(updated_transfers) >= 100: + total_updated += cls.objects.bulk_update( + updated_transfers, [f"{side}_docket"] + ) + updated_transfers = [] + + if updated_transfers: + total_updated += cls.objects.bulk_update( + updated_transfers, [f"{side}_docket"] + ) + + return total_updated, total + + @classmethod + def fill_null_dockets(cls) -> None: + logger.info( + "Attempting to populate missing fields in CaseTransfer table..." + ) + + updated_origin, total_origin = cls._fill_null_docket_side("origin") + updated_destination, total_destination = cls._fill_null_docket_side( + "destination" + ) + + logger.info( + "Update complete. 
Populated %s/%s origin dockets and %s/%s destination dockets.", + updated_origin, + total_origin, + updated_destination, + total_destination, + ) + class Meta: constraints = [ CheckConstraint( diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 2e791121f3..68ba762ab6 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -2,9 +2,11 @@ import random +import factory from factory import DictFactory, Faker, List, SubFactory from factory.declarations import LazyAttribute from factory.django import DjangoModelFactory +from juriscraper.state.texas.common import CourtID, CourtType from cl.search.factories import DocketFactory from cl.search.models import TexasDocketEntry, TexasDocument @@ -22,10 +24,25 @@ class TexasCaseDocumentDictFactory(DictFactory): class TexasDocketEntryDictFactory(DictFactory): date = Faker("date_object") type = Faker("pystr", min_chars=3, max_chars=3) - disposition = Faker("text") + attachments = List([SubFactory(TexasCaseDocumentDictFactory)]) + + +class TexasAppellateBriefDictFactory(TexasDocketEntryDictFactory): description = Faker("text") + + +class TexasSupremeCourtAppellateBriefDictFactory( + TexasAppellateBriefDictFactory +): + remarks = Faker("text") + + +class TexasCaseEventDictFactory(TexasDocketEntryDictFactory): + disposition = Faker("text") + + +class TexasSupremeCourtCaseEventDictFactory(TexasCaseEventDictFactory): remarks = Faker("text") - attachments = List([SubFactory(TexasCaseDocumentDictFactory)]) class TexasCasePartyDictFactory(DictFactory): @@ -34,15 +51,62 @@ class TexasCasePartyDictFactory(DictFactory): representatives = List([Faker("name")]) -class TexasTrialCourtDictFactory(DictFactory): - # TODO Placeholder - name = Faker("pystr") +class TexasOriginatingCourtDictFactory(DictFactory): + name = Faker("court_name") + court_type = Faker( + "random_element", + elements=( + CourtType.PROBATE.value, + CourtType.BUSINESS.value, + CourtType.COUNTY.value, + 
CourtType.MUNICIPAL.value, + CourtType.JUSTICE.value, + CourtType.UNKNOWN.value, + ), + ) + county = Faker("pystr") + judge = Faker("name") + # Close enough for testing + case = Faker("federal_district_docket_number") + reporter = Faker("name") + punishment = Faker("pystr") + + +class TexasOriginatingAppellateCourtDictFactory( + TexasOriginatingCourtDictFactory +): + court_type = CourtType.APPELLATE.value + court_id = Faker( + "random_element", + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + CourtID.FIFTEENTH_COURT_OF_APPEALS.value, + ), + ) + + +class TexasOriginatingDistrictCourtDictFactory( + TexasOriginatingCourtDictFactory +): + court_type = CourtType.DISTRICT.value + district = Faker("random_element", elements=list(range(1, 527)) + [None]) class TexasCommonDataDictFactory(DictFactory): court_id = Faker( "random_element", - elements=("texctapp1", "texctapp2", "tex", "texcrimapp"), + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.SUPREME_COURT.value, + CourtID.COURT_OF_CRIMINAL_APPEALS.value, + ), + ) + court_type = Faker( + "random_element", + elements=(CourtType.APPELLATE.value, CourtType.SUPREME.value), ) # Not correct, but close enough docket_number = Faker("federal_district_docket_number") @@ -51,11 +115,21 @@ class TexasCommonDataDictFactory(DictFactory): date_filed = Faker("date_object") case_type = Faker("pystr") parties = List([SubFactory(TexasCasePartyDictFactory)]) - trial_court = SubFactory(TexasTrialCourtDictFactory) - case_events = List([SubFactory(TexasDocketEntryDictFactory)]) + originating_court = SubFactory(TexasOriginatingCourtDictFactory) + case_events = List([SubFactory(TexasCaseEventDictFactory)]) appellate_briefs = LazyAttribute( - lambda d: filter( - lambda e: True if random.random() < 0.1 else False, d.case_events + lambda d: list( + map( + lambda ce: TexasAppellateBriefDictFactory( + 
date=ce["date"], + type=ce["type"], + attachments=ce["attachments"], + ), + filter( + lambda e: True if random.random() < 0.1 else False, + d.case_events, + ), + ) ) ) @@ -85,3 +159,125 @@ class TexasDocumentFactory(DjangoModelFactory): class Meta: model = TexasDocument + + +class TexasAppellateCourtInfoDictFactory(DictFactory): + """Factory for appeals_court field in Texas final court dockets.""" + + court_id = Faker( + "random_element", + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + CourtID.UNKNOWN.value, + ), + ) + case_number = Faker("federal_district_docket_number") + case_url = Faker("url") + disposition = Faker("pystr") + district = Faker("pystr") + justice = Faker("name") + opinion_cite = Faker("citation") + + +class TexasAppellateTransferDictFactory(DictFactory): + """Factory for transfer_from field in Texas appellate dockets.""" + + court_id = Faker( + "random_element", + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + ), + ) + origin_docket = Faker("federal_district_docket_number") + date = Faker("date_object") + + +class TexasCourtOfAppealsDocketDictFactory(TexasCommonDataDictFactory): + """Factory for Texas Court of Appeals docket data.""" + + court_type = CourtType.APPELLATE.value + court_id = Faker( + "random_element", + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + ), + ) + transfer_from = LazyAttribute( + lambda d: TexasAppellateTransferDictFactory.create() + if random.random() < 0.1 + else None + ) + transfer_to = LazyAttribute( + lambda d: TexasAppellateTransferDictFactory.create() + if random.random() < 0.1 + else None + ) + + +class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): + """Factory for Texas Supreme Court and Court of Criminal Appeals docket 
data.""" + + court_type = CourtType.SUPREME.value + appeals_court = SubFactory(TexasAppellateCourtInfoDictFactory) + court_id = Faker( + "random_element", + elements=( + CourtID.SUPREME_COURT.value, + CourtID.COURT_OF_CRIMINAL_APPEALS.value, + ), + ) + is_direct_appeal = Faker("pybool") + + @factory.post_generation + @staticmethod + def post_gen(obj, create, extracted, **kwargs): + if not create: + return + if obj["court_id"] == CourtID.SUPREME_COURT.value: + obj["case_events"] = list( + map( + lambda ce: TexasSupremeCourtCaseEventDictFactory( + date=ce["date"], + type=ce["type"], + attachments=ce["attachments"], + disposition=ce["disposition"], + remarks=ce.get( + "remarks", + TexasSupremeCourtCaseEventDictFactory.remarks, + ), + ), + obj["case_events"], + ) + ) + obj["appellate_briefs"] = list( + map( + lambda ab: TexasSupremeCourtAppellateBriefDictFactory( + date=ab["date"], + type=ab["type"], + attachments=ab["attachments"], + description=ab["description"], + remarks=ab.get( + "remarks", + TexasSupremeCourtAppellateBriefDictFactory.remarks, + ), + ), + obj["appellate_briefs"], + ) + ) + if obj["is_direct_appeal"]: + obj["appeals_court"] = TexasAppellateCourtInfoDictFactory( + court_id=CourtID.UNKNOWN.value, + case_number="", + case_url="", + disposition="", + district="", + justice="", + opinion_cite="", + ) + del obj["is_direct_appeal"] diff --git a/cl/search/tests/test_pacer_bulk_fetch.py b/cl/search/tests/test_pacer_bulk_fetch.py index 728ee373ee..c15fc7d431 100644 --- a/cl/search/tests/test_pacer_bulk_fetch.py +++ b/cl/search/tests/test_pacer_bulk_fetch.py @@ -1,12 +1,13 @@ from datetime import timedelta from unittest.mock import MagicMock, patch +import httpx import time_machine from django.core.cache import cache as django_cache from django.core.management import call_command from django.utils import timezone from django.utils.timezone import now -from requests import HTTPError +from httpx import HTTPError from cl.lib.utils import append_value_in_cache 
from cl.recap.factories import PacerFetchQueueFactory @@ -23,7 +24,6 @@ ) from cl.search.models import RECAPDocument from cl.tests.cases import TestCase -from cl.tests.utils import MockResponse from cl.users.factories import UserFactory @@ -781,11 +781,8 @@ def tearDown(self): @patch( "cl.recap.tasks.download_pacer_pdf_by_rd", - side_effect=lambda z, x, c, v, b, de_seq_num: ( - MockResponse( - 200, - b"binary content", - ), + return_value=( + httpx.Response(200, content=b"binary content"), "OK", ), ) @@ -919,11 +916,8 @@ def test_abort_fqs_after_error( @patch( "cl.recap.tasks.download_pacer_pdf_by_rd", - side_effect=lambda z, x, c, v, b, de_seq_num: ( - MockResponse( - 200, - None, - ), + return_value=( + httpx.Response(200, content=None), "Document is sealed", ), ) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 0ac33a46f3..044e56e58b 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -61,6 +61,7 @@ ) from cl.search.exception import InvalidRelativeDateSyntax from cl.search.factories import ( + CaseTransferFactory, CourtFactory, DocketEntryFactory, DocketFactory, @@ -85,6 +86,7 @@ from cl.search.models import ( PRECEDENTIAL_STATUS, SEARCH_TYPES, + CaseTransfer, Citation, ClusterRedirection, Court, @@ -3941,6 +3943,87 @@ def test_llm_clean_docket_number_daemon( ) +class CaseTransferFillNullDocketsTest(TestCase): + """Tests for CaseTransfer.fill_null_dockets.""" + + @classmethod + def setUpTestData(cls): + cls.appellate_court = CourtFactory.create( + jurisdiction=Court.STATE_APPELLATE, + ) + cls.supreme_court = CourtFactory.create( + jurisdiction=Court.STATE_SUPREME, + ) + + def test_fills_missing_origin_docket(self): + """Does fill_null_dockets populate a missing origin_docket FK?""" + origin_docket = DocketFactory.create(court=self.appellate_court) + destination_docket = DocketFactory.create(court=self.supreme_court) + transfer = CaseTransferFactory.create( + origin_court=origin_docket.court, + origin_docket=None, + 
origin_docket_number=origin_docket.docket_number, + destination_court=destination_docket.court, + destination_docket=destination_docket, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket_id == origin_docket.pk + + def test_fills_missing_destination_docket(self): + """Does fill_null_dockets populate a missing destination_docket FK?""" + origin_docket = DocketFactory.create(court=self.appellate_court) + destination_docket = DocketFactory.create(court=self.supreme_court) + transfer = CaseTransferFactory.create( + origin_court=origin_docket.court, + origin_docket=origin_docket, + destination_court=destination_docket.court, + destination_docket=None, + destination_docket_number=destination_docket.docket_number, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.destination_docket == destination_docket + + def test_leaves_already_populated_dockets_unchanged(self): + """Does fill_null_dockets leave already-populated FKs alone?""" + origin = DocketFactory.create() + destination = DocketFactory.create() + transfer = CaseTransferFactory.create( + origin_court=origin.court, + origin_docket=origin, + destination_court=destination.court, + destination_docket=destination, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket == origin + assert transfer.destination_docket == destination + + def test_no_match_leaves_null(self): + """Does fill_null_dockets leave FK null when no docket is found?""" + destination_docket = DocketFactory.create() + transfer = CaseTransferFactory.create( + origin_court=CourtFactory.create(), + origin_docket=None, + origin_docket_number=destination_docket.docket_number + + "dontmatch", + destination_court=destination_docket.court, + destination_docket=destination_docket, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket is None + + class 
SearchFormCourtCleanTest(TestCase): """Tests that SearchForm.clean() correctly normalizes court selection inputs.""" diff --git a/cl/search/tests/tests_semantic_search_opinion.py b/cl/search/tests/tests_semantic_search_opinion.py index d7f880aa34..e82aebf80d 100644 --- a/cl/search/tests/tests_semantic_search_opinion.py +++ b/cl/search/tests/tests_semantic_search_opinion.py @@ -6,6 +6,7 @@ from unittest import mock from unittest.mock import MagicMock +from asgiref.sync import async_to_sync from django.conf import settings from django.core.management import call_command from django.test import TestCase, override_settings @@ -381,9 +382,11 @@ def _get_mock_for_inception(self, vectors: dict[str, Any] | None = None): inception_response.json.return_value = vectors return inception_response - def _test_api_results_count(self, params, expected_count, field_name): + async def _test_api_results_count( + self, params, expected_count, field_name + ): """Get the result count in a API query response""" - r = self.client.get( + r = await self.async_client.get( reverse("search-list", kwargs={"version": "v4"}), params ) got = len(r.data["results"]) @@ -400,7 +403,8 @@ def _test_api_results_count(self, params, expected_count, field_name): @override_flag("store-search-api-queries", active=True) @override_settings(WAFFLE_CACHE_PREFIX="test_semantic_search_opinion") - def test_can_perform_a_regular_semantic_query( + @async_to_sync + async def test_can_perform_a_regular_semantic_query( self, inception_mock ) -> None: """Can we perform a semantic search using the API?""" @@ -411,10 +415,12 @@ def test_can_perform_a_regular_semantic_query( # Perform search and check that exactly two results are returned search_params = {"q": self.situational_query, "semantic": True} - r = self._test_api_results_count(search_params, 2, "semantic query") + r = await self._test_api_results_count( + search_params, 2, "semantic query" + ) # Ensure a SearchQuery row was logged with SEMANTIC querymode - 
last_query = SearchQuery.objects.last() + last_query = await SearchQuery.objects.alast() self.assertEqual(last_query.query_mode, SearchQuery.SEMANTIC) content = r.content.decode() @@ -429,7 +435,7 @@ def test_can_perform_a_regular_semantic_query( cluster_id=cluster["cluster_id"], msg="Snippet content test." ): for opinion in cluster["opinions"]: - record = Opinion.objects.get(id=opinion["id"]) + record = await Opinion.objects.aget(id=opinion["id"]) self.assertNotEqual( opinion["snippet"], record.plain_text[: settings.NO_MATCH_HL_SIZE], @@ -439,7 +445,9 @@ def test_can_perform_a_regular_semantic_query( self.assertNotIn(f'"cluster_id":{self.opinion_4.cluster.id}', content) self.assertNotIn(f'"cluster_id":{self.opinion_5.cluster.id}', content) - def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: + async def test_can_apply_filter_to_semantic_query( + self, inception_mock + ) -> None: """Can we apply filtering to semantic search results?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -453,7 +461,7 @@ def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: } # Should return only the opinion from the Ohio court - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 1, "semantic query with court filter" ) content = r.content.decode() @@ -468,14 +476,16 @@ def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: } # Should return only the result matching the docket number - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 1, "semantic query with docket number filter" ) content = r.content.decode() self.assertNotIn(f'"cluster_id":{self.opinion_2.cluster.id}', content) self.assertIn(f'"cluster_id":{self.opinion_3.cluster.id}', content) - def test_can_sort_semantic_search_results(self, inception_mock) -> None: + async def test_can_sort_semantic_search_results( + self, inception_mock + ) -> 
None: """Can we sort semantic search results by cite count?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -487,7 +497,9 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: "semantic": True, "order_by": "citeCount desc", } - r = self._test_api_results_count(search_params, 2, "citeCount desc") + r = await self._test_api_results_count( + search_params, 2, "citeCount desc" + ) content = r.content.decode() # Opinion with higher cite count should appear first @@ -504,7 +516,9 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: "semantic": True, "order_by": "citeCount asc", } - r = self._test_api_results_count(search_params, 2, "citeCount asc") + r = await self._test_api_results_count( + search_params, 2, "citeCount asc" + ) content = r.content.decode() # Opinion with lower cite count should appear first @@ -515,14 +529,14 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: " ordered by ascending citeCount.", ) - def test_is_semantic_score_standarized(self, inception_mock) -> None: + async def test_is_semantic_score_standarized(self, inception_mock) -> None: """Ensure that semantic scores are consistently returned as floats""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors ) search_params = {"q": self.hybrid_query, "semantic": True} - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 3, "hybrid semantic search query" ) @@ -542,7 +556,7 @@ def test_is_semantic_score_standarized(self, inception_mock) -> None: else: self.assertEqual(semantic_score, 0.0) - def test_can_do_hybrid_search_query(self, inception_mock) -> None: + async def test_can_do_hybrid_search_query(self, inception_mock) -> None: """Can we combine semantic and keyword matches in hybrid search?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -550,7 +564,7 @@ 
def test_can_do_hybrid_search_query(self, inception_mock) -> None: # Hybrid query should return semantic and keyword matches (3 total) search_params = {"q": self.hybrid_query, "semantic": True} - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 3, "hybrid semantic search query" ) content = r.content.decode() @@ -573,7 +587,7 @@ def test_can_do_hybrid_search_query(self, inception_mock) -> None: cluster_id=cluster["cluster_id"], msg="Snippet content test." ): for opinion in cluster["opinions"]: - record = Opinion.objects.get(id=opinion["id"]) + record = await Opinion.objects.aget(id=opinion["id"]) if record.id == self.opinion_5.id: self.assertEqual( opinion["snippet"], diff --git a/cl/simple_pages/templates/help/index.html b/cl/simple_pages/templates/help/index.html index 1f4d509279..de0ad07453 100644 --- a/cl/simple_pages/templates/help/index.html +++ b/cl/simple_pages/templates/help/index.html @@ -48,7 +48,7 @@
We've built some of the biggest open datasets in the world. Learn more about them:
API Documentation