From b51fbfc6f8eae1702ecfe97ca0b3bf97e0ff4330 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:04:36 -0700 Subject: [PATCH 01/87] feat(texas): Docket merger Partial docket merger implementation for Texas; new MergeResult class to decrease verbosity --- cl/corpus_importer/tasks.py | 240 +++++++++++++++++++++++++++++++++++- cl/corpus_importer/utils.py | 24 ++++ 2 files changed, 258 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 27a8157c06..5dee90671c 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import copy import functools import logging @@ -12,7 +14,7 @@ from pyexpat import ExpatError from re import Pattern from tempfile import NamedTemporaryFile -from typing import IO, Any +from typing import IO, Any, NamedTuple import environ import eyecite @@ -55,15 +57,20 @@ ) from juriscraper.pacer.reports import BaseReport from juriscraper.state.texas import ( + TexasAppealsCourt, TexasCaseEvent, TexasCaseParty, + TexasCourtOfCriminalAppealsDocket, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, + TexasSupremeCourtDocket, + TexasTrialCourt, ) from juriscraper.state.texas.common import ( TexasAppellateBrief, TexasCaseDocument, ) +from juriscraper.state.texas.court_of_appeals import TexasCourtOfAppealsDocket from openai import ( APIConnectionError, APIError, @@ -99,8 +106,10 @@ compute_binary_probe_jitter, compute_blocked_court_wait, compute_next_binary_probe, + create_docket_entry_sequence_numbers, is_appellate_court, is_long_appellate_document_number, + juriscraper_to_cl_court_id, make_iquery_probing_key, mark_ia_upload_needed, ) @@ -3416,6 +3425,42 @@ def download_texas_document_pdf( return texas_document_pk +class MergeResult[T = int](NamedTuple): + """Stores data about the result of an attempted merge operation.""" + + create: bool + """Whether a 
document needed to be created.""" + update: bool + """Whether a document needed to be updated.""" + success: bool + """Whether the operation was successful.""" + pk: T | None + """The primary key of the created or updated object.""" + + @staticmethod + def created(pk: T) -> MergeResult[T]: + """Shorthand for the result of a successful creation operation. + + :param pk: The primary key of the created object. + :return: The constructed MergeResult object.""" + return MergeResult(create=True, update=False, success=True, pk=pk) + + @staticmethod + def updated(pk: T) -> MergeResult[T]: + """Shorthand for the result of a successful update operation. + + :param pk: The primary key of the updated object. + :return: The constructed MergeResult object.""" + return MergeResult(create=False, update=True, success=True, pk=pk) + + @staticmethod + def failed() -> MergeResult[T]: + """Shorthand for the result of a failed merge operation. + + :return: The constructed MergeResult object.""" + return MergeResult(create=False, update=False, success=False, pk=None) + + def merge_texas_document( docket_entry: TexasDocketEntry, input_document: TexasCaseDocument ) -> tuple[bool, bool, int]: @@ -3443,12 +3488,12 @@ def merge_texas_document( "document_url": input_document["document_url"], }, ) - - if ( - created - or str(texas_document.media_version_id) + update = ( + str(texas_document.media_version_id) != input_document["media_version_id"] - ): + ) + + if created or update: texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.document_url = input_document["document_url"] @@ -3643,3 +3688,186 @@ def merge_texas_parties(docket: Docket, parties: list[TexasCaseParty]) -> None: :param parties: The parties involved in the Texas case. 
""" add_parties_and_attorneys(docket, normalize_texas_parties(parties)) + + +def merge_texas_docket_originating_court( + docket: Docket, originating_court_data: TexasTrialCourt +) -> MergeResult: + """Merge originating court information into the given Texas docket. + + :param docket: The docket to add the originating court to. + :param originating_court_data: The originating court data from Juriscraper. + :return: The result of the merge operation.""" + raise NotImplementedError + + +def merge_texas_case_transfers( + docket: Docket, + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> MergeResult: + """Merge appeal and work sharing information into the given Texas docket. + + :param docket: The docket to add the appeal information to. + :param docket_data: The docket data from Juriscraper. + :return: The result of the CaseTransfer merge operation""" + raise NotImplementedError + + +def generate_texas_appellate_brief_flags( + case_events: list[TexasCaseEvent], + appellate_briefs: list[TexasAppellateBrief], +) -> list[bool]: + """Generates a list of booleans indicating whether the corresponding entry + in the list of TexasCaseEvents is in the list of TexasAppellateBriefs. + + :param case_events: A list of TexasCaseEvent objects. + :param appellate_briefs: A list of TexasAppellateBrief objects. + :return: A list of booleans indicating whether the corresponding entry is + an appellate brief.""" + if not appellate_briefs: + return [False] * len(case_events) + i = 0 + flags = [] + # Assumes that appellate briefs will appear in the same order as the + # corresponding case events. 
+ for case_event in case_events: + if i == len(appellate_briefs): + flags.append(False) + continue + if case_event == appellate_briefs[i]: + flags.append(True) + i += 1 + else: + flags.append(False) + + return flags + + +def merge_texas_docket( + court: Court, + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> MergeResult: + """Merges scraped data from a Texas docket into the `Docket` table. + + :param court: The court to add the docket to. + :param docket_data: The scraped Texas docket data. + :return: The result of the merge operation.""" + with transaction.atomic(): + docket_number = docket_data["docket_number"] + try: + docket = Docket.objects.get( + court_id=court.pk, docket_number=docket_number + ) + except Docket.DoesNotExist: + logger.info( + "Could not find docket %s in court %s. Creating new Docket.", + docket_number, + court.pk, + ) + docket_created = True + docket = Docket( + court_id=court.pk, + docket_number=docket_number, + ) + except Docket.MultipleObjectsReturned: + logger.error( + "Multiple dockets found for court %s with docket number %s. This likely indicates an error in the merging code.", + court.pk, + docket_number, + ) + return MergeResult.failed() + else: + logger.info( + "Found existing Docket with docket number %s in court %s. 
Acquiring DB lock for update.", + docket_number, + court.pk, + ) + docket_created = False + Docket.objects.select_for_update().get(pk=docket.pk) + docket.date_filed = docket_data["date_filed"] + docket.cause = docket_data["case_type"] + originating_court_merge_result = merge_texas_docket_originating_court( + docket, docket_data["trial_court"] + ) + if not originating_court_merge_result.success: + logger.error( + "Failed to update originating court information for Texas docket %s in court %s", + docket.docket_number, + court.pk, + ) + lower_court_data: TexasAppealsCourt | TexasTrialCourt = ( + docket_data.get("appeals_court", docket_data["trial_court"]) + ) + lower_court_id = juriscraper_to_cl_court_id( + lower_court_data["court_id"] + ) + + if lower_court_id is not None: + logger.warning( + "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", + lower_court_id, + docket.pk, + court.pk, + ) + docket.appeal_from = lower_court_id + else: + docket.appeal_from_str = lower_court_data.get("name") + + party_merge_result = merge_texas_parties(docket, docket_data["parties"]) + # TODO: Error logging + + entry_merge_results = [ + merge_texas_docket_entry( + docket, sequence_number, appellate_brief, entry + ) + for sequence_number, appellate_brief, entry in zip( + create_docket_entry_sequence_numbers(docket_data["case_events"]), + generate_texas_appellate_brief_flags( + docket_data["case_events"], docket_data["appellate_briefs"] + ), + docket_data["case_events"], + ) + ] + # TODO: Error logging + + merge_case_transfer_result = merge_texas_case_transfers( + docket, docket_data + ) + if not merge_case_transfer_result.success: + logger.error( + "Failed to merge CaseTransfer data for Texas docket %s in court %s", + docket.docket_number, + court.pk, + ) + + create = ( + docket_created + or party_merge_result.create + or originating_court_merge_result.create + or merge_case_transfer_result.create + or any(r.create for r in 
entry_merge_results) + ) + update = ( + not docket_created + or party_merge_result.update + or originating_court_merge_result.update + or merge_case_transfer_result.update + or any(r.update for r in entry_merge_results) + ) + success = ( + party_merge_result.success + and originating_court_merge_result.success + and merge_case_transfer_result.success + and all(r.success for r in entry_merge_results) + ) + + return MergeResult( + create=create, + update=update, + success=success, + pk=docket.pk, + ) diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index d32e287884..b06297f6cc 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1290,3 +1290,27 @@ def create_docket_entry_sequence_numbers( date_counts[entry_date] = i + 1 return sequence_numbers + + +def juriscraper_to_cl_court_id(js_court_id: str) -> str | None: + """Converts a court ID from Juriscraper to the court ID used in the + database. Utility function for a lot of if statements basically. + + :param js_court_id: The court ID from Juriscraper. 
+ :return: The ID of this court in the database or `None` if the Juriscraper + ID was not recognized.""" + if js_court_id.startswith("texas_"): + js_texas_court_id = js_court_id[len("texas_") :] + + if js_texas_court_id.startswith("coa"): + coa_number = int(js_texas_court_id[len("coa") :]) + # TODO 13A and B (for some reason) + return f"txctapp{coa_number}" + if js_texas_court_id == "coscca": + return "texcrimapp" + if js_texas_court_id == "cossup": + return "tex" + logger.error("Unrecognized Texas court ID: %s", js_court_id) + return None + logger.error("Unrecognized court ID: %s", js_court_id) + return None From 98d326b98c2967f0947914f0835040362ae1cff7 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 28 Jan 2026 17:20:11 -0700 Subject: [PATCH 02/87] feat(texas): Update merge_texas_document to return MergeResult Update return type of merge_texas_document; make merge_texas_document more robust --- cl/corpus_importer/tasks.py | 66 +++++++++++++++++++++++-------------- 1 file changed, 42 insertions(+), 24 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 5dee90671c..00002671ff 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3460,10 +3460,17 @@ def failed() -> MergeResult[T]: :return: The constructed MergeResult object.""" return MergeResult(create=False, update=False, success=False, pk=None) + @staticmethod + def unnecessary(pk: T) -> MergeResult[T]: + """Shorthand for the result of a unnecessary merge operation. + + :return: The constructed MergeResult object.""" + return MergeResult(create=False, update=False, success=True, pk=pk) + def merge_texas_document( docket_entry: TexasDocketEntry, input_document: TexasCaseDocument -) -> tuple[bool, bool, int]: +) -> MergeResult: """Merge a single TexasCaseDocument object into CL. 
Checks if the document exists, creating a TexasDocument object if it does @@ -3473,27 +3480,34 @@ def merge_texas_document( :param docket_entry: The docket entry this attachment belongs to. :param input_document: The attachment to merge. - :return: Tuple with entries - - Flag indicating whether a document needed to be created or updated - - Flag indicating whether the update operation was successful or not - applicable - - Primary key of the TexasDocument object which matches the input document - """ - (texas_document, created) = TexasDocument.objects.get_or_create( - media_id=input_document["media_id"], - docket_entry=docket_entry, - defaults={ - "description": input_document["description"], - "media_version_id": input_document["media_version_id"], - "document_url": input_document["document_url"], - }, - ) - update = ( - str(texas_document.media_version_id) - != input_document["media_version_id"] - ) + :return: The result of the merge operation.""" + try: + texas_document = TexasDocument.objects.get( + media_id=input_document["media_id"], + docket_entry=docket_entry, + ) + except TexasDocument.DoesNotExist: + existed = False + needs_update = True + texas_document = TexasDocument( + media_id=input_document["media_id"], + docket_entry=docket_entry, + ) + except TexasDocument.MultipleObjectsReturned: + logger.error( + "Found multiple TexasDocument objects on the same docket entry (%s) with the same media_id (%s)", + docket_entry.pk, + input_document["media_id"], + ) + return MergeResult.failed() + else: + existed = True + needs_update = ( + str(texas_document.media_version_id) + != input_document["media_version_id"] + ) - if created or update: + if needs_update: texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.document_url = input_document["document_url"] @@ -3504,9 +3518,11 @@ def merge_texas_document( check_if_needed=False, model_name="search.TexasDocument" ), 
).apply_async() - return True, True, texas_document.pk + return MergeResult( + create=not existed, update=True, success=True, pk=texas_document.pk + ) - return False, True, texas_document.pk + return MergeResult.unnecessary(texas_document.pk) def merge_texas_documents( @@ -3538,7 +3554,7 @@ def merge_texas_docket_entry( | TexasAppellateBrief | TexasSupremeCourtCaseEvent | TexasSupremeCourtAppellateBrief, -) -> tuple[bool, bool, int]: +) -> MergeResult: """Merges a Texas docket entry into CL. :param docket: The docket this entry belongs to. @@ -3817,6 +3833,8 @@ def merge_texas_docket( else: docket.appeal_from_str = lower_court_data.get("name") + docket.save() + party_merge_result = merge_texas_parties(docket, docket_data["parties"]) # TODO: Error logging From 6bc2fc2e26573575acf2539498ff7208dd140c77 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 29 Jan 2026 12:19:05 -0700 Subject: [PATCH 03/87] feat(texas): Update mergers and tests to use MergeResult Update mergers and tests to use MergeResult; update tests to distinguish between create and update fields of result. 
--- cl/corpus_importer/tasks.py | 59 +++++++++++++------------ cl/corpus_importer/tests.py | 88 +++++++++++++++++++++++++------------ 2 files changed, 91 insertions(+), 56 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 00002671ff..d431b9e459 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -1,7 +1,6 @@ from __future__ import annotations import copy -import functools import logging import os import re @@ -3519,7 +3518,10 @@ def merge_texas_document( ), ).apply_async() return MergeResult( - create=not existed, update=True, success=True, pk=texas_document.pk + create=not existed, + update=existed, + success=True, + pk=texas_document.pk, ) return MergeResult.unnecessary(texas_document.pk) @@ -3528,22 +3530,16 @@ def merge_texas_document( def merge_texas_documents( docket_entry: TexasDocketEntry, documents: list[TexasCaseDocument], -) -> list[tuple[bool, bool, int]]: +) -> list[MergeResult]: """Merges a list of Texas docket entry attachments into CL. :param docket_entry: The docket entry this attachment belongs to. :param documents: List of TexasCaseDocument attached to this docket entry. 
- :return: List of tuples with the following entries: - - A flag indicating whether the document needed to be created or updated, - - A flag indicating which is set to True when the document was successfully - created or updated or when an update was unnecessary, - - The primary key of the updated TexasDocument object.""" - output = [ + :return: List of the results of each merge operation""" + return [ merge_texas_document(docket_entry, document) for document in documents ] - return output - @transaction.atomic def merge_texas_docket_entry( @@ -3579,9 +3575,10 @@ def merge_texas_docket_entry( entry_type=input_docket_entry["type"], appellate_brief=appellate_brief, ) - count_matching_entries = docket_entries.count() - if count_matching_entries == 0: + try: + docket_entry = docket_entries.get() + except TexasDocketEntry.DoesNotExist: logger.info( "No existing TexasDocketEntry found for sequence number %s on Docket %s. Creating new entry.", sequence_number, @@ -3594,15 +3591,7 @@ def merge_texas_docket_entry( appellate_brief=appellate_brief, ) created = True - elif count_matching_entries == 1: - logger.info( - "Found existing TexasDocketEntry for sequence number %s on Docket %s. Updating entry.", - sequence_number, - docket.pk, - ) - docket_entry = docket_entries.first() - created = False - else: + except TexasDocketEntry.MultipleObjectsReturned: # More filtering needed matching_sequence_number = docket_entries.filter( sequence_number=sequence_number @@ -3633,6 +3622,13 @@ def merge_texas_docket_entry( appellate_brief=appellate_brief, ) created = True + else: + logger.info( + "Found existing TexasDocketEntry for sequence number %s on Docket %s. 
Updating entry.", + sequence_number, + docket.pk, + ) + created = False docket_entry.sequence_number = sequence_number docket_entry.description = input_docket_entry.get("description", "") @@ -3650,16 +3646,17 @@ def merge_texas_docket_entry( docket_entry.pk, docket.pk, ) - documents = merge_texas_documents( + document_results = merge_texas_documents( docket_entry, input_docket_entry["attachments"] ) - (update_or_create, success) = functools.reduce( - lambda x, y: (x[0] or y[0], x[1] and y[1]), documents, (False, True) + return MergeResult( + create=created or any(r.create for r in document_results), + update=not created or any(r.update for r in document_results), + success=all(r.success for r in document_results), + pk=docket_entry.pk, ) - return created or update_or_create, success, docket_entry.pk - def normalize_texas_parties( parties: list[TexasCaseParty], @@ -3692,7 +3689,9 @@ def normalize_texas_parties( ] -def merge_texas_parties(docket: Docket, parties: list[TexasCaseParty]) -> None: +def merge_texas_parties( + docket: Docket, parties: list[TexasCaseParty] +) -> MergeResult: """Merge Texas case parties and attorneys into the given docket. This function takes a docket and a list of parties associated with a Texas @@ -3702,8 +3701,12 @@ def merge_texas_parties(docket: Docket, parties: list[TexasCaseParty]) -> None: :param docket: The docket to which parties and attorneys should be added. :param parties: The parties involved in the Texas case. + :return: A MergeResult indicating the operation succeeded. Note that + create and update flags are always False and pk is always None since + add_parties_and_attorneys does not return this information. 
""" add_parties_and_attorneys(docket, normalize_texas_parties(parties)) + return MergeResult(create=False, update=False, success=True, pk=None) def merge_texas_docket_originating_court( diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 7232830da2..c1c88f67d0 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2179,9 +2179,11 @@ def test_merge_texas_document_new_document(self): result = merge_texas_document(docket_entry, input_document) # Assertions - assert result == (True, True, result[2]) + assert result.create is True + assert result.success is True + assert result.pk is not None try: - created_document = TexasDocument.objects.get(pk=result[2]) + created_document = TexasDocument.objects.get(pk=result.pk) except ObjectDoesNotExist: created_document = None assert created_document is not None @@ -2217,8 +2219,10 @@ def test_merge_texas_document_existing_document_no_update(self): result = merge_texas_document(docket_entry, input_document) # Assertions - assert result == (False, True, current_document.pk) - result_document = TexasDocument.objects.get(pk=result[2]) + assert result.create is False + assert result.success is True + assert result.pk == current_document.pk + result_document = TexasDocument.objects.get(pk=result.pk) assert result_document is not None assert result_document.docket_entry_id == docket_entry.id assert result_document.description == input_document["description"] @@ -2254,8 +2258,11 @@ def test_merge_texas_document_existing_document_update(self): result = merge_texas_document(docket_entry, input_document) # Assertions - assert result == (True, True, current_document.pk) - result_document = TexasDocument.objects.get(pk=result[2]) + assert result.create is False + assert result.update is True + assert result.success is True + assert result.pk == current_document.pk + result_document = TexasDocument.objects.get(pk=result.pk) assert result_document is not None assert 
result_document.docket_entry_id == docket_entry.id assert result_document.description == input_document["description"] @@ -2297,9 +2304,9 @@ def get_test_pdf( ) docket_entry = self.docket_coa1_entry - (_, _, pk) = merge_texas_document(docket_entry, input_document) + result = merge_texas_document(docket_entry, input_document) docket_entry.refresh_from_db() - document = TexasDocument.objects.get(pk=pk) + document = TexasDocument.objects.get(pk=result.pk) self.assertEqual(response.call_count, 1) self.assertEqual(document.document_url, input_document["document_url"]) @@ -2325,8 +2332,14 @@ def test_merge_texas_documents(self): result = merge_texas_documents(docket_entry, input_documents) assert len(result) == 2 - assert result[0] == (True, True, result[0][2]) - assert result[1] == (False, True, current_attachment.pk) + assert result[0].create is True + assert result[0].update is False + assert result[0].success is True + assert result[0].pk is not None + assert result[1].create is False + assert result[1].update is False + assert result[1].success is True + assert result[1].pk == current_attachment.pk def test_merge_texas_docket_entry_new_entry(self): """Can we correctly handle a docket entry?""" @@ -2340,8 +2353,11 @@ def test_merge_texas_docket_entry_new_entry(self): self.docket_coa1, "2025-01-02.000", True, docket_entry ) - assert output == (True, True, output[2]) - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is True + assert output.update is False + assert output.success is True + assert output.pk is not None + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == docket_entry["type"] assert created_docket_entry.description == docket_entry["description"] @@ -2356,9 +2372,10 @@ def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" js_docket_entry = 
TexasDocketEntryDictFactory() - (_, _, pk) = merge_texas_docket_entry( + result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) + pk = result.pk # Reset call count self.extract_pdf_document_mock.reset_mock() @@ -2367,9 +2384,12 @@ def test_merge_texas_docket_entry_no_update(self): self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) - assert output == (False, True, output[2]) - assert output[2] == pk - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk is not None + assert output.pk == pk + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] assert ( @@ -2387,9 +2407,10 @@ def test_merge_texas_docket_entry_add_document(self): js_docket_entry = TexasDocketEntryDictFactory() initial_n_attachments = len(js_docket_entry["attachments"]) - (_, _, pk) = merge_texas_docket_entry( + result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) + pk = result.pk # Reset call count self.extract_pdf_document_mock.reset_mock() @@ -2398,9 +2419,12 @@ def test_merge_texas_docket_entry_add_document(self): self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) - assert output == (True, True, output[2]) - assert output[2] == pk - created_docket_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is True + assert output.update is True + assert output.success is True + assert output.pk is not None + assert output.pk == pk + created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] assert ( @@ -2445,8 +2469,11 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): 
self.docket_coa1, "2025-01-02.001", True, js_docket_entry ) - assert output == (False, True, existing_entry_2.pk) - updated_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk == existing_entry_2.pk + updated_entry = TexasDocketEntry.objects.get(pk=output.pk) assert updated_entry.description == "Updated description" assert updated_entry.sequence_number == "2025-01-02.001" # Ensure the first entry was not modified @@ -2477,8 +2504,11 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): self.docket_coa1, "2025-01-04.001", True, js_docket_entry ) - assert output == (False, True, existing_entry.pk) - updated_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is False + assert output.update is True + assert output.success is True + assert output.pk == existing_entry.pk + updated_entry = TexasDocketEntry.objects.get(pk=output.pk) assert updated_entry.description == "Updated description" assert updated_entry.sequence_number == "2025-01-04.001" @@ -2514,10 +2544,12 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): self.docket_coa1, "2025-01-03.002", True, js_docket_entry ) - assert output[0] is True # created - assert output[1] is True # success - assert output[2] not in (existing_entry_1.pk, existing_entry_2.pk) - new_entry = TexasDocketEntry.objects.get(pk=output[2]) + assert output.create is True + assert output.update is False + assert output.success is True + assert output.pk is not None + assert output.pk not in (existing_entry_1.pk, existing_entry_2.pk) + new_entry = TexasDocketEntry.objects.get(pk=output.pk) assert new_entry.description == "New third entry" assert new_entry.sequence_number == "2025-01-03.002" # Ensure existing entries were not modified From fcee6bfcc3039f88ede36f2b132c09f038972bdf Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 4 Feb 2026 21:29:13 +0000 Subject: [PATCH 04/87] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cl/corpus_importer/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 19c44be4bc..b42b4a938b 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1315,6 +1315,8 @@ def juriscraper_to_cl_court_id(js_court_id: str) -> str | None: return None logger.error("Unrecognized court ID: %s", js_court_id) return None + + @dataclass class DownloadPDFResult: """Result of a PDF download operation.""" From 8657460ed28aa5cfe392a946c7b94019684254fb Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 5 Feb 2026 18:04:32 -0600 Subject: [PATCH 05/87] feat(texas): Originating court merger Add logic to merge originating court information into cases --- cl/corpus_importer/tasks.py | 58 +++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 8d26e8b6b8..743eeafe31 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -175,6 +175,7 @@ DocketEntry, Opinion, OpinionCluster, + OriginatingCourtInformation, RECAPDocument, ScotusDocketMetadata, Tag, @@ -3725,15 +3726,45 @@ def merge_texas_parties( return MergeResult(create=False, update=False, success=True, pk=None) +def texas_court_name_to_court(name: str, court_type=None) -> Court | None: + """Translates a Texas court name to a CourtListener court ID. + + Uses the name and type of court extracted from Juriscraper and attempts to + translate them to a CL `Court` object. 
If we cannot find a matching `Court` + object, return `None`.""" + raise NotImplementedError + + def merge_texas_docket_originating_court( - docket: Docket, originating_court_data: TexasTrialCourt + docket: Docket, + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, ) -> MergeResult: """Merge originating court information into the given Texas docket. :param docket: The docket to add the originating court to. - :param originating_court_data: The originating court data from Juriscraper. + :param docket_data: The docket data from Juriscraper. :return: The result of the merge operation.""" - raise NotImplementedError + originating_court_information = docket.originating_court_information + originating_court_data = docket_data["trial_court"] + created = False + if not originating_court_information: + created = True + originating_court_information = OriginatingCourtInformation( + docket_number=originating_court_data["case"], + ) + + originating_court_information.court_reporter = originating_court_data[ + "reporter" + ] + originating_court_information.assigned_to_str = originating_court_data[ + "judge" + ] + # TODO Get judge from PeopleDB to add + originating_court_information.save() + + return MergeResult(create=created, update=False, success=True, pk=None) def merge_texas_case_transfers( @@ -3791,7 +3822,11 @@ def merge_texas_docket( :param court: The court to add the docket to. :param docket_data: The scraped Texas docket data. 
:return: The result of the merge operation.""" + # TODO Maybe we should derive `court` from the scraped data to make things simpler with transaction.atomic(): + Docket.objects.select_for_update().get( + docket_number=docket_data["docket_number"], court_id=court.pk + ) docket_number = docket_data["docket_number"] try: docket = Docket.objects.get( @@ -3826,7 +3861,7 @@ def merge_texas_docket( docket.date_filed = docket_data["date_filed"] docket.cause = docket_data["case_type"] originating_court_merge_result = merge_texas_docket_originating_court( - docket, docket_data["trial_court"] + docket, docket_data ) if not originating_court_merge_result.success: logger.error( @@ -3837,25 +3872,31 @@ def merge_texas_docket( lower_court_data: TexasAppealsCourt | TexasTrialCourt = ( docket_data.get("appeals_court", docket_data["trial_court"]) ) + # TODO Won't have court_id lower_court_id = juriscraper_to_cl_court_id( lower_court_data["court_id"] ) if lower_court_id is not None: + docket.appeal_from = lower_court_id + else: logger.warning( "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", lower_court_id, docket.pk, court.pk, ) - docket.appeal_from = lower_court_id - else: - docket.appeal_from_str = lower_court_data.get("name") + docket.appeal_from_str = lower_court_data.get("name") docket.save() party_merge_result = merge_texas_parties(docket, docket_data["parties"]) - # TODO: Error logging + if not party_merge_result.success: + logger.error( + "Failed to merge party data for Texas docket %s in court %s", + docket.docket_number, + court.pk, + ) entry_merge_results = [ merge_texas_docket_entry( @@ -3869,7 +3910,6 @@ def merge_texas_docket( docket_data["case_events"], ) ] - # TODO: Error logging merge_case_transfer_result = merge_texas_case_transfers( docket, docket_data From 8e4aa74507e8810d4ef0b114a6592b8e3bee9535 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Feb 2026 
11:31:31 -0700 Subject: [PATCH 06/87] test(texas): Update fuzzing to new schema Update fuzzing code to use new JS schema. --- cl/search/state/texas/factories.py | 53 +++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index f59ceba688..1631d96468 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -34,15 +34,58 @@ class TexasCasePartyDictFactory(DictFactory): representatives = List([Faker("name")]) -class TexasTrialCourtDictFactory(DictFactory): - # TODO Placeholder - name = Faker("pystr") +class TexasOriginatingCourtDictFactory(DictFactory): + name = Faker("court_name") + # TODO Replace the literals with values from Juriscraper when the dependency is updated + court_type = Faker( + "random_element", + elements=( + "texas_appellate", + "texas_district", + "texas_probate", + "texas_business", + "texas_county", + "texas_municipal", + "texas_justice", + "texas_unknown", + ), + ) + county = Faker("pystr") + judge = Faker("name") + # Close enough for testing + case = Faker("federal_district_docket_number") + reporter = Faker("name") + punishment = Faker("pystr") + + +class TexasOriginatingAppellateCourtDictFactory( + TexasOriginatingCourtDictFactory +): + court_id = Faker( + "random_element", + elements=("texas_coa01", "texas_coa02", "texas_coa14", "texas_coa15"), + ) + + +class TexasOriginatingDistrictCourtDictFactory( + TexasOriginatingCourtDictFactory +): + district = Faker("random_element", elements=list(range(1, 527)) + [None]) class TexasCommonDataDictFactory(DictFactory): court_id = Faker( "random_element", - elements=("texctapp1", "texctapp2", "tex", "texcrimapp"), + elements=( + "texas_coa01", + "texas_coa02", + "texas_cossup", + "texas_coscca", + ), + ) + court_type = Faker( + "random_element", + elements=("texas_appellate", "texas_final"), ) # Not correct, but close enough docket_number = 
Faker("federal_district_docket_number") @@ -51,7 +94,7 @@ class TexasCommonDataDictFactory(DictFactory): date_filed = Faker("date_object") case_type = Faker("pystr") parties = List([SubFactory(TexasCasePartyDictFactory)]) - trial_court = SubFactory(TexasTrialCourtDictFactory) + originating_court = SubFactory(TexasOriginatingCourtDictFactory) case_events = List([SubFactory(TexasDocketEntryDictFactory)]) appellate_briefs = LazyAttribute( lambda d: filter( From 34f1b2928f4fe3f87e9e95a5cb395774e7f96ff1 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Feb 2026 12:20:09 -0700 Subject: [PATCH 07/87] feat(texas): Finalize required merging logic Finalize merging logic with the exception of `CaseTransfer` objects --- cl/corpus_importer/tasks.py | 97 ++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 743eeafe31..4e61556025 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -67,8 +67,10 @@ TexasTrialCourt, ) from juriscraper.state.texas.common import ( + CourtID, TexasAppellateBrief, TexasCaseDocument, + TexasCommonData, ) from juriscraper.state.texas.court_of_appeals import TexasCourtOfAppealsDocket from openai import ( @@ -3544,20 +3546,6 @@ def merge_texas_document( return MergeResult.unnecessary(texas_document.pk) -def merge_texas_documents( - docket_entry: TexasDocketEntry, - documents: list[TexasCaseDocument], -) -> list[MergeResult]: - """Merges a list of Texas docket entry attachments into CL. - - :param docket_entry: The docket entry this attachment belongs to. - :param documents: List of TexasCaseDocument attached to this docket entry. 
- :return: List of the results of each merge operation""" - return [ - merge_texas_document(docket_entry, document) for document in documents - ] - - @transaction.atomic def merge_texas_docket_entry( docket: Docket, @@ -3663,9 +3651,10 @@ def merge_texas_docket_entry( docket_entry.pk, docket.pk, ) - document_results = merge_texas_documents( - docket_entry, input_docket_entry["attachments"] - ) + document_results = [ + merge_texas_document(input_docket_entry, document) + for document in input_docket_entry["attachments"] + ] return MergeResult( create=created or any(r.create for r in document_results), @@ -3726,13 +3715,49 @@ def merge_texas_parties( return MergeResult(create=False, update=False, success=True, pk=None) -def texas_court_name_to_court(name: str, court_type=None) -> Court | None: - """Translates a Texas court name to a CourtListener court ID. - - Uses the name and type of court extracted from Juriscraper and attempts to - translate them to a CL `Court` object. If we cannot find a matching `Court` - object, return `None`.""" - raise NotImplementedError +def texas_js_court_id_to_court_id(js_court_id: str) -> str: + """Translates a Juriscraper Texas court ID to a CourtListener Court ID. + + :param js_court_id: The court ID extracted from Juriscraper. + :return: The corresponding Court ID.""" + if js_court_id == CourtID.SUPREME_COURT.value: + return "tex" + if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: + return "texcrimapp" + # Court of appeals + appellate_number = str(int(js_court_id[len("texas_") :])) + if appellate_number == "13": + appellate_number = "13A" + return f"txctapp{appellate_number}" + + +def texas_originating_court_to_court_id( + court_data: TexasOriginatingCourt, +) -> str | None: + """Attempts to translate Juriscraper Texas originating court data to a + CourtListener Court ID. + + :param court_data: The originating court data from Juriscraper. 
+ :return: The matching Court ID or None if no court could be found.""" + # TODO Replace with JS CourtID enum values when dependency is updated + court_type = court_data["court_type"] + if court_type == "texas_appellate": + return texas_js_court_id_to_court_id(court_data["court_id"]) + if court_type == "texas_district": + district_number = court_data["district"] + if district_number: + if district_number > 1: + district_number = district_number + 1 + return f"txdistct{district_number}" + return "texdistct" + if court_type == "texas_business": + return "texbizct" + if court_type == "texas_municipal": + return "texctyct" + if court_type == "texas_probate": + return "texprobct" + # County, justice, and unknown court types + return None def merge_texas_docket_originating_court( @@ -3812,17 +3837,17 @@ def generate_texas_appellate_brief_flags( def merge_texas_docket( - court: Court, docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, ) -> MergeResult: """Merges scraped data from a Texas docket into the `Docket` table. - :param court: The court to add the docket to. :param docket_data: The scraped Texas docket data. 
:return: The result of the merge operation.""" - # TODO Maybe we should derive `court` from the scraped data to make things simpler + court = Court.objects.get( + pk=texas_js_court_id_to_court_id(docket_data["court_id"]) + ) with transaction.atomic(): Docket.objects.select_for_update().get( docket_number=docket_data["docket_number"], court_id=court.pk @@ -3869,13 +3894,17 @@ def merge_texas_docket( docket.docket_number, court.pk, ) - lower_court_data: TexasAppealsCourt | TexasTrialCourt = ( - docket_data.get("appeals_court", docket_data["trial_court"]) - ) - # TODO Won't have court_id - lower_court_id = juriscraper_to_cl_court_id( - lower_court_data["court_id"] - ) + + if docket_data["court_type"] == "texas_appellate": + lower_court_data = docket_data["originating_court"] + lower_court_id = texas_originating_court_to_court_id( + lower_court_data + ) + else: + lower_court_data = docket_data["appeals_court"] + lower_court_id = texas_js_court_id_to_court_id( + lower_court_data["court_id"] + ) if lower_court_id is not None: docket.appeal_from = lower_court_id From 51ee74c3730a697c274b3cc6f4d5330a5d7571c6 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:35:14 -0700 Subject: [PATCH 08/87] feat(texas): CaseTransfer merger --- cl/corpus_importer/tasks.py | 127 +++++++++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 1 deletion(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 4e61556025..45012acce3 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -171,6 +171,7 @@ from cl.search.models import ( PRECEDENTIAL_STATUS, SOURCES, + CaseTransfer, ClaimHistory, Court, Docket, @@ -3803,7 +3804,131 @@ def merge_texas_case_transfers( :param docket: The docket to add the appeal information to. :param docket_data: The docket data from Juriscraper. 
:return: The result of the CaseTransfer merge operation""" - raise NotImplementedError + trial_court_id = texas_originating_court_to_court_id( + docket_data["originating_court"] + ) + + if docket_data["court_type"] == "texas_final": + # Assume that the originating court -> appellate court transfer will + # be populated by an appellate docket later on. + transfer = CaseTransfer( + destination_court=docket.court, + destination_docket_id=docket.docket_number, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, + ) + + appeals_court = docket_data["appeals_court"] + if appeals_court["court_id"] == CourtID.UNKNOWN: + appeals_court_id = "texapp" + else: + appeals_court_id = texas_js_court_id_to_court_id( + appeals_court["court_id"] + ) + + if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS: + # Death penalty cases are automatically appealed to the CCA so the + # appellate court may be missing. + appeals_court = docket_data["appeals_court"] + if appeals_court["court_id"] == CourtID.UNKNOWN: + # Death penalty appeal + if trial_court_id: + transfer.origin_court = Court.objects.get( + pk=trial_court_id + ) + transfer.origin_docket_id = docket_data[ + "originating_court" + ]["case"] + else: + logger.error( + "Unable to determine trial court ID for Texas docket %s to create CaseTransfer", + docket.docket_number, + ) + return MergeResult.failed() + else: + transfer.origin_court = Court.objects.get(pk=appeals_court_id) + transfer.origin_docket_id = appeals_court["case_number"] + elif docket_data["court_id"] == CourtID.SUPREME_COURT: + transfer.origin_court = Court.objects.get(pk=appeals_court_id) + transfer.origin_docket_id = appeals_court["case_number"] + else: + logger.error( + "Unrecognized Texas final court ID %s while creating CaseTransfer", + docket_data["court_id"], + ) + return MergeResult.failed() + transfers = [transfer] + elif docket_data["court_type"] == "texas_appellate": + transfers = [] + if trial_court_id: + 
transfers.append( + CaseTransfer( + origin_court=Court.objects.get(pk=trial_court_id), + origin_docket_id=docket_data["originating_court"]["case"], + destination_court=docket.court, + destination_docket_id=docket.docket_number, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, + ) + ) + if docket_data["transfer_from"]: + transfers.append( + CaseTransfer( + origin_court=Court.objects.get( + pk=texas_js_court_id_to_court_id( + docket_data["transfer_from"]["court_id"] + ) + ), + origin_docket_id=docket_data["transfer_from"][ + "origin_docket" + ], + destination_court=docket.court, + destination_docket_id=docket.docket_number, + # The "date" field of transfers is not always set, but when it is, it seems to match date filed. + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.WORKLOAD, + ) + ) + # Assume that the value in the "transfer_to" field will be filled in + # by another court. + else: + logger.error( + "Unrecognized Texas court type %s while creating CaseTransfer", + docket_data["court_type"], + ) + return MergeResult.failed() + + any_created = False + for transfer in transfers: + _, created = CaseTransfer.objects.get_or_create( + origin_court=transfer.origin_court, + origin_docket_id=transfer.origin_docket_id, + destination_court=transfer.destination_court, + destination_docket_id=transfer.destination_docket_id, + transfer_date=transfer.transfer_date, + transfer_type=transfer.transfer_type, + ) + if created: + any_created = True + logger.info( + "Created CaseTransfer object from docket %s in court %s to docket %s in court %s", + transfer.origin_docket_id, + transfer.origin_court.pk, + transfer.destination_docket_id, + transfer.destination_court.pk, + ) + else: + logger.warning( + "CaseTransfer object from docket %s in court %s to docket %s in court %s already exists", + transfer.origin_docket_id, + transfer.origin_court.pk, + transfer.destination_docket_id, + transfer.destination_court.pk, + ) + + return 
MergeResult( + success=True, created=any_created, updated=False, pk=None + ) def generate_texas_appellate_brief_flags( From d831fda43a9b650c8ac8904a39da4d09d85e8557 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Feb 2026 14:44:55 -0700 Subject: [PATCH 09/87] feat(db): Update CaseTransfer to use text docket IDs Use text fields for dockets in CaseTransfer instead of foreign keys since in many cases while populating transfers we will not yet or will never have a docket object to reference directly. --- .../0052_case_transfer_docket_to_text_id.py | 53 +++++++++++++++++++ .../0052_case_transfer_docket_to_text_id.sql | 44 +++++++++++++++ ...e_transfer_docket_to_text_id_customers.sql | 24 +++++++++ cl/search/models.py | 16 ++---- 4 files changed, 125 insertions(+), 12 deletions(-) create mode 100644 cl/search/migrations/0052_case_transfer_docket_to_text_id.py create mode 100644 cl/search/migrations/0052_case_transfer_docket_to_text_id.sql create mode 100644 cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.py b/cl/search/migrations/0052_case_transfer_docket_to_text_id.py new file mode 100644 index 0000000000..c637a7f97a --- /dev/null +++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id.py @@ -0,0 +1,53 @@ +# Generated by Django 6.0.1 on 2026-02-09 21:36 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('search', '0051_texas_models'), + ] + + operations = [ + migrations.RemoveField( + model_name='casetransfer', + name='destination_docket', + ), + migrations.RemoveField( + model_name='casetransfer', + name='origin_docket', + ), + migrations.RemoveField( + model_name='casetransferevent', + name='destination_docket', + ), + migrations.RemoveField( + model_name='casetransferevent', + name='origin_docket', + ), + migrations.AddField( + 
model_name='casetransfer', + name='destination_docket_id', + field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), + preserve_default=False, + ), + migrations.AddField( + model_name='casetransfer', + name='origin_docket_id', + field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), + preserve_default=False, + ), + migrations.AddField( + model_name='casetransferevent', + name='destination_docket_id', + field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), + preserve_default=False, + ), + migrations.AddField( + model_name='casetransferevent', + name='origin_docket_id', + field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), + preserve_default=False, + ), + ] diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql b/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql new file mode 100644 index 0000000000..eb720bf414 --- /dev/null +++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql @@ -0,0 +1,44 @@ +BEGIN; +-- +-- Remove field destination_docket from casetransfer +-- +SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; +ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; +-- +-- Remove field origin_docket from casetransfer +-- +SET CONSTRAINTS "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT 
"search_casetransfer_origin_docket_id_b23a08e9_fk_search_do"; +ALTER TABLE "search_casetransfer" DROP COLUMN "origin_docket_id"; +-- +-- Remove field destination_docket from casetransferevent +-- +ALTER TABLE "search_casetransferevent" DROP COLUMN "destination_docket_id"; +-- +-- Remove field origin_docket from casetransferevent +-- +ALTER TABLE "search_casetransferevent" DROP COLUMN "origin_docket_id"; +-- +-- Add field destination_docket_id to casetransfer +-- +ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +-- +-- Add field origin_docket_id to casetransfer +-- +ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +-- +-- Add field destination_docket_id to casetransferevent +-- +ALTER TABLE "search_casetransferevent" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransferevent" ALTER COLUMN "destination_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransferevent"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +-- +-- Add field origin_docket_id to casetransferevent +-- +ALTER TABLE "search_casetransferevent" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransferevent" ALTER COLUMN "origin_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransferevent"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +COMMIT; diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql 
b/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql new file mode 100644 index 0000000000..23558ffcd1 --- /dev/null +++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql @@ -0,0 +1,24 @@ +BEGIN; +-- +-- Remove field destination_docket from casetransfer +-- +SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; +ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; +-- +-- Remove field origin_docket from casetransfer +-- +SET CONSTRAINTS "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do"; +ALTER TABLE "search_casetransfer" DROP COLUMN "origin_docket_id"; +-- +-- Add field destination_docket_id to casetransfer +-- +ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +-- +-- Add field origin_docket_id to casetransfer +-- +ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_id" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index 596b656dbd..8bd9ff2406 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3981,9 +3981,9 @@ class CaseTransfer(AbstractDateTimeModel): an appeal, workload balancing, or docket merging. :ivar origin_court: The court this transfer originates from. 
- :ivar origin_docket: The docket this transfer originates from. + :ivar origin_docket_id: The ID of the docket this transfer originates from. :ivar destination_court: The court the docket is being transferred to. - :ivar destination_docket: The case docket in the destination court. + :ivar destination_docket_id: The ID of the case docket in the destination court. :ivar transfer_date: The date this transfer occurred. :ivar transfer_type: The type of transfer (appeal, work sharing, etc.). """ @@ -4007,21 +4007,13 @@ class CaseTransfer(AbstractDateTimeModel): on_delete=models.CASCADE, related_name="case_transfer_origin_court", ) - origin_docket = models.ForeignKey( - "search.Docket", - on_delete=models.CASCADE, - related_name="case_transfer_origin_docket", - ) + origin_docket_id = models.TextField() destination_court = models.ForeignKey( "search.Court", on_delete=models.CASCADE, related_name="case_transfer_destination_court", ) - destination_docket = models.ForeignKey( - "search.Docket", - on_delete=models.CASCADE, - related_name="case_transfer_destination_docket", - ) + destination_docket_id = models.TextField() transfer_date = models.DateField() transfer_type = models.SmallIntegerField( choices=transfer_type_choices.items(), From 9e40ca57987e0b6f6ebe7673be8ad2076720d1c5 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Feb 2026 17:43:48 -0700 Subject: [PATCH 10/87] tests(texas): Tests for CaseTransfer merger Mockup of tests for CaseTransfer merger. 
--- cl/corpus_importer/tasks.py | 9 +- cl/corpus_importer/tests.py | 317 ++++++++++++++++++++++++++++- cl/search/state/texas/factories.py | 60 ++++++ 3 files changed, 381 insertions(+), 5 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 45012acce3..575978a68e 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3773,14 +3773,15 @@ def merge_texas_docket_originating_court( :param docket_data: The docket data from Juriscraper. :return: The result of the merge operation.""" originating_court_information = docket.originating_court_information - originating_court_data = docket_data["trial_court"] + originating_court_data = docket_data["originating_court"] created = False if not originating_court_information: created = True - originating_court_information = OriginatingCourtInformation( - docket_number=originating_court_data["case"], - ) + originating_court_information = OriginatingCourtInformation() + originating_court_information.docket_number = ( + originating_court_data["case"], + ) originating_court_information.court_reporter = originating_court_data[ "reporter" ] diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 762aacb4f3..247d74f0a5 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -85,9 +85,10 @@ download_texas_document_pdf, generate_ia_json, get_and_save_free_document_report, + merge_texas_case_transfers, merge_texas_docket_entry, + merge_texas_docket_originating_court, merge_texas_document, - merge_texas_documents, merge_texas_parties, normalize_texas_parties, probe_or_scrape_iquery_pages, @@ -157,17 +158,23 @@ from cl.search.models import ( SEARCH_TYPES, SOURCES, + CaseTransfer, Citation, Docket, Opinion, OpinionCluster, + OriginatingCourtInformation, RECAPDocument, ) from cl.search.state.texas.factories import ( + TexasAppellateCourtInfoDictFactory, TexasCaseDocumentDictFactory, + TexasCourtOfAppealsDocketDictFactory, 
TexasDocketEntryDictFactory, TexasDocketEntryFactory, TexasDocumentFactory, + TexasFinalCourtDocketDictFactory, + TexasOriginatingDistrictCourtDictFactory, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument from cl.settings import MEDIA_ROOT @@ -2800,6 +2807,314 @@ def test_download_texas_document_pdf_download_failure(self): texas_document.refresh_from_db() assert not texas_document.filepath_local + def test_merge_texas_docket_originating_court_creates_new(self): + """Can we create new originating court information?""" + docket_data = TexasCourtOfAppealsDocketDictFactory( + docket_number=self.docket_number_coa1, + originating_court=TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + ), + ) + + result = merge_texas_docket_originating_court( + self.docket_coa1, docket_data + ) + + assert result.create is True + assert result.success is True + + self.docket_coa1.refresh_from_db() + originating_info = self.docket_coa1.originating_court_information + assert originating_info is not None + assert ( + originating_info.docket_number + == docket_data["originating_court"]["case"] + ) + assert ( + originating_info.court_reporter + == docket_data["originating_court"]["reporter"] + ) + assert ( + originating_info.assigned_to_str + == docket_data["originating_court"]["judge"] + ) + + def test_merge_texas_docket_originating_court_updates_existing(self): + """Can we update existing originating court information?""" + # Create existing originating court information + self.docket_coa1.originating_court_information = ( + OriginatingCourtInformation.objects.create( + docket_number="OLD-123", + court_reporter="Old Reporter", + assigned_to_str="Old Judge", + ) + ) + self.docket_coa1.save() + + originating_court = TexasOriginatingDistrictCourtDictFactory() + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id="texas_coa01", + docket_number=self.docket_number_coa1, + originating_court=originating_court, + ) + + result 
= merge_texas_docket_originating_court( + self.docket_coa1, docket_data + ) + + assert result.create is False + assert result.success is True + + self.docket_coa1.refresh_from_db() + updated_info = self.docket_coa1.originating_court_information + assert updated_info is not None + assert updated_info.docket_number == originating_court["case"] + assert updated_info.court_reporter == originating_court["reporter"] + assert updated_info.assigned_to_str == originating_court["judge"] + + def test_merge_texas_case_transfers_appellate_court_from_trial(self): + """Can we create a CaseTransfer for an appellate court case?""" + texas_district = CourtFactory.create(id="txdistct6") + + originating_court = TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + case="2023-12345", + ) + docket_data = TexasCourtOfAppealsDocketFactory( + court_id="texas_coa01", + docket_number=self.docket_number_coa1, + date_filed=date(2025, 1, 15), + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_coa1, + destination_docket_id=self.docket_number_coa1, + ) + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.origin_court.id == "txdistct6" + assert transfer.origin_docket_id == "2023-12345" + assert transfer.transfer_type == CaseTransfer.APPEAL + assert transfer.transfer_date == date(2025, 1, 15) + + def test_merge_texas_case_transfers_appellate_with_workload_transfer( + self, + ): + """Can we create CaseTransfer for appellate case with work sharing?""" + texas_district = CourtFactory.create(id="txdistct6") + texas_coa2 = CourtFactory.create(id="texas_coa2") + + originating_court = TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + case="2023-12345", + ) + transfer_from = 
TexasTransferFromDictFactory( + court_id="texas_coa02", + origin_docket="02-24-00999-CV", + date=date(2025, 1, 10), + ) + docket_data = TexasCourtOfAppealsDocketFactory( + court_id="texas_coa01", + docket_number=self.docket_number_coa1, + date_filed=date(2025, 1, 15), + originating_court=originating_court, + transfer_from=transfer_from, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_coa1, + destination_docket_id=self.docket_number_coa1, + ) + assert transfers.count() == 2 + + appeal_transfer = transfers.get(transfer_type=CaseTransfer.APPEAL) + assert appeal_transfer.origin_court.id == "txdistct6" + assert appeal_transfer.origin_docket_id == "2023-12345" + + workload_transfer = transfers.get(transfer_type=CaseTransfer.WORKLOAD) + assert workload_transfer.origin_court == texas_coa2 + assert workload_transfer.origin_docket_id == "02-24-00999-CV" + + def test_merge_texas_case_transfers_supreme_court(self): + """Can we create a CaseTransfer for a Supreme Court case?""" + docket_sc = DocketFactory.create( + court=self.texas_sc, docket_number="25-0100" + ) + + appeals_court = TexasAppellateCourtInfoDictFactory( + court_id="texas_coa01", + case_number=self.docket_number_coa1, + ) + docket_data = TexasSupremeCourtDocketFactory( + docket_number="25-0100", + date_filed=date(2025, 1, 15), + originating_court=TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + ), + appeals_court=appeals_court, + ) + + result = merge_texas_case_transfers(docket_sc, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_sc, + destination_docket_id="25-0100", + ) + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.origin_court == self.texas_coa1 + assert 
transfer.origin_docket_id == self.docket_number_coa1 + assert transfer.transfer_type == CaseTransfer.APPEAL + assert transfer.transfer_date == date(2025, 1, 15) + + def test_merge_texas_case_transfers_cca_from_appellate(self): + """Can we create a CaseTransfer for CCA from appellate court?""" + docket_cca = DocketFactory.create( + court=self.texas_cca, docket_number="PD-0100-25" + ) + + appeals_court = TexasAppellateCourtInfoDictFactory( + court_id="texas_coa01", + case_number=self.docket_number_coa1, + ) + docket_data = TexasCourtOfCriminalAppealsDocketFactory( + docket_number="PD-0100-25", + date_filed=date(2025, 1, 15), + originating_court=TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + ), + appeals_court=appeals_court, + ) + + result = merge_texas_case_transfers(docket_cca, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_cca, + destination_docket_id="PD-0100-25", + ) + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.origin_court == self.texas_coa1 + assert transfer.origin_docket_id == self.docket_number_coa1 + assert transfer.transfer_type == CaseTransfer.APPEAL + + def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( + self, + ): + """Can we handle death penalty direct appeals to CCA?""" + docket_cca = DocketFactory.create( + court=self.texas_cca, docket_number="AP-76,000" + ) + + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5 + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id="texas_coscca", + docket_number="AP-76,000", + originating_court=originating_court, + appeals_court=None, + ) + + result = merge_texas_case_transfers(docket_cca, docket_data) + + assert result.success is True + assert result.create is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_cca, + destination_docket_id="AP-76,000", 
+ ) + assert transfers.count() == 1 + transfer = transfers.first() + assert transfer.origin_court.id == "txdistct6" + assert transfer.origin_docket_id == originating_court["case"] + assert transfer.transfer_type == CaseTransfer.APPEAL + + def test_merge_texas_case_transfers_no_trial_court_info(self): + """Do we handle appellate cases without trial court info?""" + originating_court = TexasOriginatingDistrictCourtDictFactory( + court_type="texas_unknown", + district=None, + case="", + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id="texas_coa01", + docket_number=self.docket_number_coa1, + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_coa1, + destination_docket_id=self.docket_number_coa1, + ) + assert transfers.count() == 0 + + def test_merge_texas_case_transfers_duplicate_handling(self): + """Do we properly handle duplicate CaseTransfer objects?""" + texas_district = CourtFactory.create(id="txdistct6") + + CaseTransfer.objects.create( + origin_court=texas_district, + origin_docket_id="2023-12345", + destination_court=self.texas_coa1, + destination_docket_id=self.docket_number_coa1, + transfer_date=date(2025, 1, 15), + transfer_type=CaseTransfer.APPEAL, + ) + + originating_court = TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + case="2023-12345", + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id="texas_coa01", + docket_number=self.docket_number_coa1, + date_filed=date(2025, 1, 15), + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_case_transfers(self.docket_coa1, docket_data) + + assert result.success is True + assert result.create is False + + transfers = CaseTransfer.objects.filter( + destination_court=self.texas_coa1, + 
destination_docket_id=self.docket_number_coa1, + ) + assert transfers.count() == 1 + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 1631d96468..973508e9f5 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -128,3 +128,63 @@ class TexasDocumentFactory(DjangoModelFactory): class Meta: model = TexasDocument + + +class TexasAppellateCourtInfoDictFactory(DictFactory): + """Factory for appeals_court field in Texas final court dockets.""" + + court_id = Faker( + "random_element", + elements=( + "texas_coa01", + "texas_coa02", + "texas_coa14", + "texas_unknown", + ), + ) + case_number = Faker("federal_district_docket_number") + case_url = Faker("url") + disposition = Faker("pystr") + district = Faker("pystr") + justice = Faker("Name") + opinion_cite = Faker("citation") + + +class TexasAppellateTransferDictFactory(DictFactory): + """Factory for transfer_from field in Texas appellate dockets.""" + + court_id = Faker( + "random_element", + elements=("texas_coa01", "texas_coa02", "texas_coa14"), + ) + origin_docket = Faker("federal_district_docket_number") + date = Faker("date_object") + + +class TexasCourtOfAppealsDocketDictFactory(TexasCommonDataDictFactory): + """Factory for Texas Court of Appeals docket data.""" + + court_type = "texas_appellate" + court_id = Faker( + "random_element", + elements=("texas_coa01", "texas_coa02", "texas_coa14"), + ) + transfer_from = LazyAttribute( + lambda d: TexasAppellateTransferDictFactory.create() + if random.random() < 0.1 + else None + ) + transfer_to = LazyAttribute( + lambda d: TexasAppellateTransferDictFactory.create() + if random.random() < 0.1 + else None + ) + + +class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): + """Factory for Texas Supreme Court and Court of Criminal Appeals docket data.""" + + court_type = "texas_final" + court_id = Faker( + 
"random_element", elements=("texas_cossup", "texas_coscca") + ) From 9663338f23a9c956ae22ce5b52365a5b7fd37384 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 10 Feb 2026 00:44:07 +0000 Subject: [PATCH 11/87] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cl/corpus_importer/tasks.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 575978a68e..c478bd60a7 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -57,20 +57,17 @@ ) from juriscraper.pacer.reports import BaseReport from juriscraper.state.texas import ( - TexasAppealsCourt, TexasCaseEvent, TexasCaseParty, TexasCourtOfCriminalAppealsDocket, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, TexasSupremeCourtDocket, - TexasTrialCourt, ) from juriscraper.state.texas.common import ( CourtID, TexasAppellateBrief, TexasCaseDocument, - TexasCommonData, ) from juriscraper.state.texas.court_of_appeals import TexasCourtOfAppealsDocket from openai import ( @@ -112,7 +109,6 @@ create_docket_entry_sequence_numbers, is_appellate_court, is_long_appellate_document_number, - juriscraper_to_cl_court_id, make_iquery_probing_key, mark_ia_upload_needed, ) From 70f263e2928121020abd9a16ed1018c69f33b76e Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:35:09 -0700 Subject: [PATCH 12/87] fix(texas): Bad DB migration Don't name things "name_" --- cl/corpus_importer/tasks.py | 30 ++++++++-------- .../0052_case_transfer_docket_to_text_id.py | 28 ++++++++++++--- .../0052_case_transfer_docket_to_text_id.sql | 35 +++++++++---------- ...e_transfer_docket_to_text_id_customers.sql | 19 +++++----- cl/search/models.py | 8 ++--- 5 files changed, 67 insertions(+), 53 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py 
index c478bd60a7..f18fa42383 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3810,7 +3810,7 @@ def merge_texas_case_transfers( # be populated by an appellate docket later on. transfer = CaseTransfer( destination_court=docket.court, - destination_docket_id=docket.docket_number, + destination_docket_number=docket.docket_number, transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.APPEAL, ) @@ -3833,7 +3833,7 @@ def merge_texas_case_transfers( transfer.origin_court = Court.objects.get( pk=trial_court_id ) - transfer.origin_docket_id = docket_data[ + transfer.origin_docket_number = docket_data[ "originating_court" ]["case"] else: @@ -3844,10 +3844,10 @@ def merge_texas_case_transfers( return MergeResult.failed() else: transfer.origin_court = Court.objects.get(pk=appeals_court_id) - transfer.origin_docket_id = appeals_court["case_number"] + transfer.origin_docket_number = appeals_court["case_number"] elif docket_data["court_id"] == CourtID.SUPREME_COURT: transfer.origin_court = Court.objects.get(pk=appeals_court_id) - transfer.origin_docket_id = appeals_court["case_number"] + transfer.origin_docket_number = appeals_court["case_number"] else: logger.error( "Unrecognized Texas final court ID %s while creating CaseTransfer", @@ -3861,9 +3861,11 @@ def merge_texas_case_transfers( transfers.append( CaseTransfer( origin_court=Court.objects.get(pk=trial_court_id), - origin_docket_id=docket_data["originating_court"]["case"], + origin_docket_number=docket_data["originating_court"][ + "case" + ], destination_court=docket.court, - destination_docket_id=docket.docket_number, + destination_docket_number=docket.docket_number, transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.APPEAL, ) @@ -3876,11 +3878,11 @@ def merge_texas_case_transfers( docket_data["transfer_from"]["court_id"] ) ), - origin_docket_id=docket_data["transfer_from"][ + origin_docket_number=docket_data["transfer_from"][ "origin_docket" ], 
destination_court=docket.court, - destination_docket_id=docket.docket_number, + destination_docket_number=docket.docket_number, # The "date" field of transfers is not always set, but when it is, it seems to match date filed. transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.WORKLOAD, @@ -3899,9 +3901,9 @@ def merge_texas_case_transfers( for transfer in transfers: _, created = CaseTransfer.objects.get_or_create( origin_court=transfer.origin_court, - origin_docket_id=transfer.origin_docket_id, + origin_docket_number=transfer.origin_docket_number, destination_court=transfer.destination_court, - destination_docket_id=transfer.destination_docket_id, + destination_docket_number=transfer.destination_docket_number, transfer_date=transfer.transfer_date, transfer_type=transfer.transfer_type, ) @@ -3909,17 +3911,17 @@ def merge_texas_case_transfers( any_created = True logger.info( "Created CaseTransfer object from docket %s in court %s to docket %s in court %s", - transfer.origin_docket_id, + transfer.origin_docket_number, transfer.origin_court.pk, - transfer.destination_docket_id, + transfer.destination_docket_number, transfer.destination_court.pk, ) else: logger.warning( "CaseTransfer object from docket %s in court %s to docket %s in court %s already exists", - transfer.origin_docket_id, + transfer.origin_docket_number, transfer.origin_court.pk, - transfer.destination_docket_id, + transfer.destination_docket_number, transfer.destination_court.pk, ) diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.py b/cl/search/migrations/0052_case_transfer_docket_to_text_id.py index c637a7f97a..3c8c8f0701 100644 --- a/cl/search/migrations/0052_case_transfer_docket_to_text_id.py +++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id.py @@ -1,5 +1,7 @@ -# Generated by Django 6.0.1 on 2026-02-09 21:36 +# Generated by Django 6.0.1 on 2026-02-10 17:22 +import pgtrigger.compiler +import pgtrigger.migrations from django.db import migrations, models 
@@ -10,6 +12,14 @@ class Migration(migrations.Migration): ] operations = [ + pgtrigger.migrations.RemoveTrigger( + model_name='casetransfer', + name='update_update', + ), + pgtrigger.migrations.RemoveTrigger( + model_name='casetransfer', + name='delete_delete', + ), migrations.RemoveField( model_name='casetransfer', name='destination_docket', @@ -28,26 +38,34 @@ class Migration(migrations.Migration): ), migrations.AddField( model_name='casetransfer', - name='destination_docket_id', + name='destination_docket_number', field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), preserve_default=False, ), migrations.AddField( model_name='casetransfer', - name='origin_docket_id', + name='origin_docket_number', field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), preserve_default=False, ), migrations.AddField( model_name='casetransferevent', - name='destination_docket_id', + name='destination_docket_number', field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), preserve_default=False, ), migrations.AddField( model_name='casetransferevent', - name='origin_docket_id', + name='origin_docket_number', field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), preserve_default=False, ), + pgtrigger.migrations.AddTrigger( + model_name='casetransfer', + trigger=pgtrigger.compiler.Trigger(name='update_update', sql=pgtrigger.compiler.UpsertTriggerSql(condition='WHEN (OLD."destination_court_id" IS DISTINCT FROM (NEW."destination_court_id") OR OLD."destination_docket_number" IS DISTINCT FROM (NEW."destination_docket_number") OR OLD."id" IS DISTINCT 
FROM (NEW."id") OR OLD."origin_court_id" IS DISTINCT FROM (NEW."origin_court_id") OR OLD."origin_docket_number" IS DISTINCT FROM (NEW."origin_docket_number") OR OLD."transfer_date" IS DISTINCT FROM (NEW."transfer_date") OR OLD."transfer_type" IS DISTINCT FROM (NEW."transfer_type"))', func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', hash='80b917540054665b8a49836933ea803aad526606', operation='UPDATE', pgid='pgtrigger_update_update_8e8e1', table='search_casetransfer', when='AFTER')), + ), + pgtrigger.migrations.AddTrigger( + model_name='casetransfer', + trigger=pgtrigger.compiler.Trigger(name='delete_delete', sql=pgtrigger.compiler.UpsertTriggerSql(func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', hash='1ebca4ba9fa46ebba2ad0edfdedbc2710d4cb1b6', operation='DELETE', pgid='pgtrigger_delete_delete_b8bc0', table='search_casetransfer', when='AFTER')), + ), ] diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql 
b/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql index eb720bf414..ca82dce37f 100644 --- a/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql +++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql @@ -1,7 +1,4 @@ BEGIN; --- --- Remove field destination_docket from casetransfer --- SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; -- @@ -18,27 +15,27 @@ ALTER TABLE "search_casetransferevent" DROP COLUMN "destination_docket_id"; -- ALTER TABLE "search_casetransferevent" DROP COLUMN "origin_docket_id"; -- --- Add field destination_docket_id to casetransfer +-- Add field destination_docket_number to casetransfer -- -ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; -- --- Add field origin_docket_id to casetransfer +-- Add field origin_docket_number to casetransfer -- -ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_number" text DEFAULT '' NOT NULL; 
+ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; -- --- Add field destination_docket_id to casetransferevent +-- Add field destination_docket_number to casetransferevent -- -ALTER TABLE "search_casetransferevent" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransferevent" ALTER COLUMN "destination_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransferevent"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +ALTER TABLE "search_casetransferevent" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransferevent" ALTER COLUMN "destination_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransferevent"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; -- --- Add field origin_docket_id to casetransferevent +-- Add field origin_docket_number to casetransferevent -- -ALTER TABLE "search_casetransferevent" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransferevent" ALTER COLUMN "origin_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransferevent"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +ALTER TABLE "search_casetransferevent" ADD COLUMN "origin_docket_number" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransferevent" ALTER COLUMN "origin_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransferevent"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; COMMIT; diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql b/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql index 23558ffcd1..7877aa77da 100644 --- a/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql 
+++ b/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql @@ -1,7 +1,4 @@ BEGIN; --- --- Remove field destination_docket from casetransfer --- SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; -- @@ -10,15 +7,15 @@ ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; SET CONSTRAINTS "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do"; ALTER TABLE "search_casetransfer" DROP COLUMN "origin_docket_id"; -- --- Add field destination_docket_id to casetransfer +-- Add field destination_docket_number to casetransfer -- -ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."destination_docket_id" IS 'The ID of the case docket in the destination court.'; +ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; -- --- Add field origin_docket_id to casetransfer +-- Add field origin_docket_number to casetransfer -- -ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_id" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_id" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."origin_docket_id" IS 'The ID of the docket this transfer originates from.'; +ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_number" 
text DEFAULT '' NOT NULL; +ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_number" DROP DEFAULT; +COMMENT ON COLUMN "search_casetransfer"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; COMMIT; diff --git a/cl/search/models.py b/cl/search/models.py index 8bd9ff2406..6e9a17cc29 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -3981,9 +3981,9 @@ class CaseTransfer(AbstractDateTimeModel): an appeal, workload balancing, or docket merging. :ivar origin_court: The court this transfer originates from. - :ivar origin_docket_id: The ID of the docket this transfer originates from. + :ivar origin_docket_number: The ID of the docket this transfer originates from. :ivar destination_court: The court the docket is being transferred to. - :ivar destination_docket_id: The ID of the case docket in the destination court. + :ivar destination_docket_number: The ID of the case docket in the destination court. :ivar transfer_date: The date this transfer occurred. :ivar transfer_type: The type of transfer (appeal, work sharing, etc.). """ @@ -4007,13 +4007,13 @@ class CaseTransfer(AbstractDateTimeModel): on_delete=models.CASCADE, related_name="case_transfer_origin_court", ) - origin_docket_id = models.TextField() + origin_docket_number = models.TextField() destination_court = models.ForeignKey( "search.Court", on_delete=models.CASCADE, related_name="case_transfer_destination_court", ) - destination_docket_id = models.TextField() + destination_docket_number = models.TextField() transfer_date = models.DateField() transfer_type = models.SmallIntegerField( choices=transfer_type_choices.items(), From 1f9c10f9be2a5b4bea626e208b7ba84aabd949a3 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 10:36:10 -0700 Subject: [PATCH 13/87] test(texas): Fix AI nonsense Finish fixing up badness from having Claude generate tests. 
--- cl/corpus_importer/tests.py | 90 +++++++++++------------------- cl/search/state/texas/factories.py | 35 ++++++++---- 2 files changed, 57 insertions(+), 68 deletions(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 247d74f0a5..c4a2350adf 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -25,6 +25,7 @@ from juriscraper.state.texas import ( TexasCaseParty, ) +from juriscraper.state.texas.common import CourtID from openai import RateLimitError from pydantic import ValidationError @@ -168,6 +169,7 @@ ) from cl.search.state.texas.factories import ( TexasAppellateCourtInfoDictFactory, + TexasAppellateTransferDictFactory, TexasCaseDocumentDictFactory, TexasCourtOfAppealsDocketDictFactory, TexasDocketEntryDictFactory, @@ -2321,34 +2323,6 @@ def get_test_pdf( self.assertTrue(document.filepath_local) self.assertIn("UNITED", document.plain_text) - def test_merge_texas_documents(self): - """Can we correctly handle multiple documents?""" - docket_entry = self.docket_coa1_entry - existing_document = TexasCaseDocumentDictFactory() - current_attachment = TexasDocumentFactory.create( - docket_entry=docket_entry, - description=existing_document["description"], - media_id=existing_document["media_id"], - media_version_id=existing_document["media_version_id"], - document_url=existing_document["document_url"], - ) - input_documents = [ - TexasCaseDocumentDictFactory(), - existing_document, - ] - - result = merge_texas_documents(docket_entry, input_documents) - - assert len(result) == 2 - assert result[0].create is True - assert result[0].update is False - assert result[0].success is True - assert result[0].pk is not None - assert result[1].create is False - assert result[1].update is False - assert result[1].success is True - assert result[1].pk == current_attachment.pk - def test_merge_texas_docket_entry_new_entry(self): """Can we correctly handle a docket entry?""" docket_entry = TexasDocketEntryDictFactory( @@ -2854,7 
+2828,7 @@ def test_merge_texas_docket_originating_court_updates_existing(self): originating_court = TexasOriginatingDistrictCourtDictFactory() docket_data = TexasCourtOfAppealsDocketDictFactory( - court_id="texas_coa01", + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, originating_court=originating_court, ) @@ -2882,8 +2856,8 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): district=5, case="2023-12345", ) - docket_data = TexasCourtOfAppealsDocketFactory( - court_id="texas_coa01", + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, date_filed=date(2025, 1, 15), originating_court=originating_court, @@ -2897,12 +2871,12 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): transfers = CaseTransfer.objects.filter( destination_court=self.texas_coa1, - destination_docket_id=self.docket_number_coa1, + destination_docket_number=self.docket_number_coa1, ) assert transfers.count() == 1 transfer = transfers.first() assert transfer.origin_court.id == "txdistct6" - assert transfer.origin_docket_id == "2023-12345" + assert transfer.origin_docket_number == "2023-12345" assert transfer.transfer_type == CaseTransfer.APPEAL assert transfer.transfer_date == date(2025, 1, 15) @@ -2918,13 +2892,13 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( district=5, case="2023-12345", ) - transfer_from = TexasTransferFromDictFactory( - court_id="texas_coa02", + transfer_from = TexasAppellateTransferDictFactory( + court_id=CourtID.SECOND_COURT_OF_APPEALS.value, origin_docket="02-24-00999-CV", date=date(2025, 1, 10), ) - docket_data = TexasCourtOfAppealsDocketFactory( - court_id="texas_coa01", + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, date_filed=date(2025, 1, 15), originating_court=originating_court, @@ 
-2938,17 +2912,17 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( transfers = CaseTransfer.objects.filter( destination_court=self.texas_coa1, - destination_docket_id=self.docket_number_coa1, + destination_docket_number=self.docket_number_coa1, ) assert transfers.count() == 2 appeal_transfer = transfers.get(transfer_type=CaseTransfer.APPEAL) assert appeal_transfer.origin_court.id == "txdistct6" - assert appeal_transfer.origin_docket_id == "2023-12345" + assert appeal_transfer.origin_docket_number == "2023-12345" workload_transfer = transfers.get(transfer_type=CaseTransfer.WORKLOAD) assert workload_transfer.origin_court == texas_coa2 - assert workload_transfer.origin_docket_id == "02-24-00999-CV" + assert workload_transfer.origin_docket_number == "02-24-00999-CV" def test_merge_texas_case_transfers_supreme_court(self): """Can we create a CaseTransfer for a Supreme Court case?""" @@ -2957,10 +2931,11 @@ def test_merge_texas_case_transfers_supreme_court(self): ) appeals_court = TexasAppellateCourtInfoDictFactory( - court_id="texas_coa01", + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, case_number=self.docket_number_coa1, ) - docket_data = TexasSupremeCourtDocketFactory( + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, docket_number="25-0100", date_filed=date(2025, 1, 15), originating_court=TexasOriginatingDistrictCourtDictFactory( @@ -2977,12 +2952,12 @@ def test_merge_texas_case_transfers_supreme_court(self): transfers = CaseTransfer.objects.filter( destination_court=self.texas_sc, - destination_docket_id="25-0100", + destination_docket_number="25-0100", ) assert transfers.count() == 1 transfer = transfers.first() assert transfer.origin_court == self.texas_coa1 - assert transfer.origin_docket_id == self.docket_number_coa1 + assert transfer.origin_docket_number == self.docket_number_coa1 assert transfer.transfer_type == CaseTransfer.APPEAL assert transfer.transfer_date == date(2025, 1, 15) @@ -2993,10 
+2968,11 @@ def test_merge_texas_case_transfers_cca_from_appellate(self): ) appeals_court = TexasAppellateCourtInfoDictFactory( - court_id="texas_coa01", + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, case_number=self.docket_number_coa1, ) - docket_data = TexasCourtOfCriminalAppealsDocketFactory( + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, docket_number="PD-0100-25", date_filed=date(2025, 1, 15), originating_court=TexasOriginatingDistrictCourtDictFactory( @@ -3013,12 +2989,12 @@ def test_merge_texas_case_transfers_cca_from_appellate(self): transfers = CaseTransfer.objects.filter( destination_court=self.texas_cca, - destination_docket_id="PD-0100-25", + destination_docket_number="PD-0100-25", ) assert transfers.count() == 1 transfer = transfers.first() assert transfer.origin_court == self.texas_coa1 - assert transfer.origin_docket_id == self.docket_number_coa1 + assert transfer.origin_docket_number == self.docket_number_coa1 assert transfer.transfer_type == CaseTransfer.APPEAL def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( @@ -3033,7 +3009,7 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( district=5 ) docket_data = TexasFinalCourtDocketDictFactory( - court_id="texas_coscca", + court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, docket_number="AP-76,000", originating_court=originating_court, appeals_court=None, @@ -3046,12 +3022,12 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( transfers = CaseTransfer.objects.filter( destination_court=self.texas_cca, - destination_docket_id="AP-76,000", + destination_docket_number="AP-76,000", ) assert transfers.count() == 1 transfer = transfers.first() assert transfer.origin_court.id == "txdistct6" - assert transfer.origin_docket_id == originating_court["case"] + assert transfer.origin_docket_number == originating_court["case"] assert transfer.transfer_type == CaseTransfer.APPEAL def 
test_merge_texas_case_transfers_no_trial_court_info(self): @@ -3062,7 +3038,7 @@ def test_merge_texas_case_transfers_no_trial_court_info(self): case="", ) docket_data = TexasCourtOfAppealsDocketDictFactory( - court_id="texas_coa01", + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, originating_court=originating_court, transfer_from=None, @@ -3074,7 +3050,7 @@ def test_merge_texas_case_transfers_no_trial_court_info(self): transfers = CaseTransfer.objects.filter( destination_court=self.texas_coa1, - destination_docket_id=self.docket_number_coa1, + destination_docket_number=self.docket_number_coa1, ) assert transfers.count() == 0 @@ -3084,9 +3060,9 @@ def test_merge_texas_case_transfers_duplicate_handling(self): CaseTransfer.objects.create( origin_court=texas_district, - origin_docket_id="2023-12345", + origin_docket_number="2023-12345", destination_court=self.texas_coa1, - destination_docket_id=self.docket_number_coa1, + destination_docket_number=self.docket_number_coa1, transfer_date=date(2025, 1, 15), transfer_type=CaseTransfer.APPEAL, ) @@ -3097,7 +3073,7 @@ def test_merge_texas_case_transfers_duplicate_handling(self): case="2023-12345", ) docket_data = TexasCourtOfAppealsDocketDictFactory( - court_id="texas_coa01", + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, date_filed=date(2025, 1, 15), originating_court=originating_court, @@ -3111,7 +3087,7 @@ def test_merge_texas_case_transfers_duplicate_handling(self): transfers = CaseTransfer.objects.filter( destination_court=self.texas_coa1, - destination_docket_id=self.docket_number_coa1, + destination_docket_number=self.docket_number_coa1, ) assert transfers.count() == 1 diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 973508e9f5..2205cebfbc 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -5,6 +5,7 @@ from factory import DictFactory, Faker, List, SubFactory 
from factory.declarations import LazyAttribute from factory.django import DjangoModelFactory +from juriscraper.state.texas.common import CourtID from cl.search.factories import DocketFactory from cl.search.models import TexasDocketEntry, TexasDocument @@ -77,10 +78,10 @@ class TexasCommonDataDictFactory(DictFactory): court_id = Faker( "random_element", elements=( - "texas_coa01", - "texas_coa02", - "texas_cossup", - "texas_coscca", + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.SUPREME_COURT.value, + CourtID.COURT_OF_CRIMINAL_APPEALS.value, ), ) court_type = Faker( @@ -136,10 +137,10 @@ class TexasAppellateCourtInfoDictFactory(DictFactory): court_id = Faker( "random_element", elements=( - "texas_coa01", - "texas_coa02", - "texas_coa14", - "texas_unknown", + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + CourtID.UNKNOWN.value, ), ) case_number = Faker("federal_district_docket_number") @@ -155,7 +156,11 @@ class TexasAppellateTransferDictFactory(DictFactory): court_id = Faker( "random_element", - elements=("texas_coa01", "texas_coa02", "texas_coa14"), + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + ), ) origin_docket = Faker("federal_district_docket_number") date = Faker("date_object") @@ -167,7 +172,11 @@ class TexasCourtOfAppealsDocketDictFactory(TexasCommonDataDictFactory): court_type = "texas_appellate" court_id = Faker( "random_element", - elements=("texas_coa01", "texas_coa02", "texas_coa14"), + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + ), ) transfer_from = LazyAttribute( lambda d: TexasAppellateTransferDictFactory.create() @@ -186,5 +195,9 @@ class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): court_type = "texas_final" court_id = Faker( 
- "random_element", elements=("texas_cossup", "texas_coscca") + "random_element", + elements=( + CourtID.SUPREME_COURT.value, + CourtID.COURT_OF_CRIMINAL_APPEALS.value, + ), ) From 461b1d9e14c60f96241d8c9bc458472638f4d783 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 11:23:40 -0700 Subject: [PATCH 14/87] fix(texas): Fix errors in tests --- cl/corpus_importer/tasks.py | 28 ++++++++++++++++------------ cl/corpus_importer/tests.py | 12 ++++++------ cl/search/state/texas/factories.py | 3 ++- 3 files changed, 24 insertions(+), 19 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index f18fa42383..76777c89a4 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3649,7 +3649,7 @@ def merge_texas_docket_entry( docket.pk, ) document_results = [ - merge_texas_document(input_docket_entry, document) + merge_texas_document(docket_entry, document) for document in input_docket_entry["attachments"] ] @@ -3722,7 +3722,7 @@ def texas_js_court_id_to_court_id(js_court_id: str) -> str: if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: return "texcrimapp" # Court of appeals - appellate_number = str(int(js_court_id[len("texas_") :])) + appellate_number = str(int(js_court_id[len("texas_coa") :])) if appellate_number == "13": appellate_number = "13A" return f"txctapp{appellate_number}" @@ -3745,7 +3745,7 @@ def texas_originating_court_to_court_id( if district_number: if district_number > 1: district_number = district_number + 1 - return f"txdistct{district_number}" + return f"texdistct{district_number}" return "texdistct" if court_type == "texas_business": return "texbizct" @@ -3816,14 +3816,14 @@ def merge_texas_case_transfers( ) appeals_court = docket_data["appeals_court"] - if appeals_court["court_id"] == CourtID.UNKNOWN: - appeals_court_id = "texapp" - else: - appeals_court_id = texas_js_court_id_to_court_id( - appeals_court["court_id"] - ) if 
docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS: + if appeals_court["court_id"] == CourtID.UNKNOWN: + appeals_court_id = "texapp" + else: + appeals_court_id = texas_js_court_id_to_court_id( + appeals_court["court_id"] + ) # Death penalty cases are automatically appealed to the CCA so the # appellate court may be missing. appeals_court = docket_data["appeals_court"] @@ -3846,6 +3846,12 @@ def merge_texas_case_transfers( transfer.origin_court = Court.objects.get(pk=appeals_court_id) transfer.origin_docket_number = appeals_court["case_number"] elif docket_data["court_id"] == CourtID.SUPREME_COURT: + if appeals_court["court_id"] == CourtID.UNKNOWN: + appeals_court_id = "texapp" + else: + appeals_court_id = texas_js_court_id_to_court_id( + appeals_court["court_id"] + ) transfer.origin_court = Court.objects.get(pk=appeals_court_id) transfer.origin_docket_number = appeals_court["case_number"] else: @@ -3925,9 +3931,7 @@ def merge_texas_case_transfers( transfer.destination_court.pk, ) - return MergeResult( - success=True, created=any_created, updated=False, pk=None - ) + return MergeResult(success=True, create=any_created, update=False, pk=None) def generate_texas_appellate_brief_flags( diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index c4a2350adf..f1e2cc57e1 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2849,7 +2849,7 @@ def test_merge_texas_docket_originating_court_updates_existing(self): def test_merge_texas_case_transfers_appellate_court_from_trial(self): """Can we create a CaseTransfer for an appellate court case?""" - texas_district = CourtFactory.create(id="txdistct6") + texas_district = CourtFactory.create(id="texdistct6") originating_court = TexasOriginatingDistrictCourtDictFactory( court_type="texas_district", @@ -2875,7 +2875,7 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): ) assert transfers.count() == 1 transfer = transfers.first() - assert 
transfer.origin_court.id == "txdistct6" + assert transfer.origin_court == texas_district assert transfer.origin_docket_number == "2023-12345" assert transfer.transfer_type == CaseTransfer.APPEAL assert transfer.transfer_date == date(2025, 1, 15) @@ -2884,8 +2884,8 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( self, ): """Can we create CaseTransfer for appellate case with work sharing?""" - texas_district = CourtFactory.create(id="txdistct6") - texas_coa2 = CourtFactory.create(id="texas_coa2") + texas_district = CourtFactory.create(id="texdistct6") + texas_coa2 = CourtFactory.create(id="txctapp2") originating_court = TexasOriginatingDistrictCourtDictFactory( court_type="texas_district", @@ -2917,7 +2917,7 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( assert transfers.count() == 2 appeal_transfer = transfers.get(transfer_type=CaseTransfer.APPEAL) - assert appeal_transfer.origin_court.id == "txdistct6" + assert appeal_transfer.origin_court == texas_district assert appeal_transfer.origin_docket_number == "2023-12345" workload_transfer = transfers.get(transfer_type=CaseTransfer.WORKLOAD) @@ -3056,7 +3056,7 @@ def test_merge_texas_case_transfers_no_trial_court_info(self): def test_merge_texas_case_transfers_duplicate_handling(self): """Do we properly handle duplicate CaseTransfer objects?""" - texas_district = CourtFactory.create(id="txdistct6") + texas_district = CourtFactory.create(id="texdistct6") CaseTransfer.objects.create( origin_court=texas_district, diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 2205cebfbc..068ce11862 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -147,7 +147,7 @@ class TexasAppellateCourtInfoDictFactory(DictFactory): case_url = Faker("url") disposition = Faker("pystr") district = Faker("pystr") - justice = Faker("Name") + justice = Faker("name") opinion_cite = Faker("citation") @@ -194,6 +194,7 @@ class 
TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): """Factory for Texas Supreme Court and Court of Criminal Appeals docket data.""" court_type = "texas_final" + appeals_court = SubFactory(TexasAppellateCourtInfoDictFactory) court_id = Faker( "random_element", elements=( From a724d3d74e7e51a7df6e2f74c2db50b77c05496e Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 12:18:22 -0700 Subject: [PATCH 15/87] fix(texas): Fix broken behavior Fix things that weren't working discovered by tests. --- cl/corpus_importer/tasks.py | 39 +++++---- cl/corpus_importer/tests.py | 122 +++++++++++++---------------- cl/search/factories.py | 21 +++++ cl/search/state/texas/factories.py | 4 +- 4 files changed, 98 insertions(+), 88 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 76777c89a4..b3e55f2f36 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3768,16 +3768,17 @@ def merge_texas_docket_originating_court( :param docket: The docket to add the originating court to. :param docket_data: The docket data from Juriscraper. 
:return: The result of the merge operation.""" - originating_court_information = docket.originating_court_information - originating_court_data = docket_data["originating_court"] created = False - if not originating_court_information: + if not docket.originating_court_information: created = True - originating_court_information = OriginatingCourtInformation() + docket.originating_court_information = OriginatingCourtInformation() - originating_court_information.docket_number = ( - originating_court_data["case"], - ) + originating_court_information = docket.originating_court_information + originating_court_data = docket_data["originating_court"] + + originating_court_information.docket_number = originating_court_data[ + "case" + ] originating_court_information.court_reporter = originating_court_data[ "reporter" ] @@ -3786,6 +3787,8 @@ def merge_texas_docket_originating_court( ] # TODO Get judge from PeopleDB to add originating_court_information.save() + if created: + docket.save() return MergeResult(create=created, update=False, success=True, pk=None) @@ -3817,17 +3820,13 @@ def merge_texas_case_transfers( appeals_court = docket_data["appeals_court"] - if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS: - if appeals_court["court_id"] == CourtID.UNKNOWN: - appeals_court_id = "texapp" - else: - appeals_court_id = texas_js_court_id_to_court_id( - appeals_court["court_id"] - ) + if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS.value: # Death penalty cases are automatically appealed to the CCA so the # appellate court may be missing. 
- appeals_court = docket_data["appeals_court"] - if appeals_court["court_id"] == CourtID.UNKNOWN: + if ( + not appeals_court + or appeals_court["court_id"] == CourtID.UNKNOWN + ): # Death penalty appeal if trial_court_id: transfer.origin_court = Court.objects.get( @@ -3843,9 +3842,15 @@ def merge_texas_case_transfers( ) return MergeResult.failed() else: + if appeals_court["court_id"] == CourtID.UNKNOWN: + appeals_court_id = "texapp" + else: + appeals_court_id = texas_js_court_id_to_court_id( + appeals_court["court_id"] + ) transfer.origin_court = Court.objects.get(pk=appeals_court_id) transfer.origin_docket_number = appeals_court["case_number"] - elif docket_data["court_id"] == CourtID.SUPREME_COURT: + elif docket_data["court_id"] == CourtID.SUPREME_COURT.value: if appeals_court["court_id"] == CourtID.UNKNOWN: appeals_court_id = "texapp" else: diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index f1e2cc57e1..11195bacc2 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -145,6 +145,7 @@ from cl.scrapers.models import PACERFreeDocumentRow from cl.scrapers.tasks import update_docket_info_iquery from cl.search.factories import ( + CaseTransferFactory, CourtFactory, DocketEntryFactory, DocketFactory, @@ -176,6 +177,7 @@ TexasDocketEntryFactory, TexasDocumentFactory, TexasFinalCourtDocketDictFactory, + TexasOriginatingCourtDictFactory, TexasOriginatingDistrictCourtDictFactory, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument @@ -2162,9 +2164,9 @@ def setUp(self): @classmethod def setUpTestData(cls): """Create test data for Texas merger tests""" - cls.texas_sc = CourtFactory.create(id="texas_sc") - cls.texas_cca = CourtFactory.create(id="texas_cca") - cls.texas_coa1 = CourtFactory.create(id="texas_coa1") + cls.texas_sc = CourtFactory.create(id="tex") + cls.texas_cca = CourtFactory.create(id="texcrimapp") + cls.texas_coa1 = CourtFactory.create(id="txctapp1") cls.docket_number_coa1 = 
"01-25-00011-CV" cls.docket_coa1 = DocketFactory.create( court=cls.texas_coa1, docket_number=cls.docket_number_coa1 @@ -2783,10 +2785,11 @@ def test_download_texas_document_pdf_download_failure(self): def test_merge_texas_docket_originating_court_creates_new(self): """Can we create new originating court information?""" + self.docket_coa1.originating_court_information = None + self.docket_coa1.save() docket_data = TexasCourtOfAppealsDocketDictFactory( docket_number=self.docket_number_coa1, originating_court=TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", district=5, ), ) @@ -2869,14 +2872,13 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): assert result.success is True assert result.create is True - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_coa1, - destination_docket_number=self.docket_number_coa1, - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 1 transfer = transfers.first() + assert transfer.destination_court == self.texas_coa1 + assert transfer.destination_docket_number == self.docket_number_coa1 assert transfer.origin_court == texas_district - assert transfer.origin_docket_number == "2023-12345" + assert transfer.origin_docket_number == originating_court["case"] assert transfer.transfer_type == CaseTransfer.APPEAL assert transfer.transfer_date == date(2025, 1, 15) @@ -2888,13 +2890,10 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( texas_coa2 = CourtFactory.create(id="txctapp2") originating_court = TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", district=5, - case="2023-12345", ) transfer_from = TexasAppellateTransferDictFactory( court_id=CourtID.SECOND_COURT_OF_APPEALS.value, - origin_docket="02-24-00999-CV", date=date(2025, 1, 10), ) docket_data = TexasCourtOfAppealsDocketDictFactory( @@ -2910,25 +2909,35 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( assert result.success is True 
assert result.create is True - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_coa1, - destination_docket_number=self.docket_number_coa1, - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 2 appeal_transfer = transfers.get(transfer_type=CaseTransfer.APPEAL) + assert appeal_transfer.destination_court == self.texas_coa1 + assert ( + appeal_transfer.destination_docket_number + == self.docket_number_coa1 + ) assert appeal_transfer.origin_court == texas_district - assert appeal_transfer.origin_docket_number == "2023-12345" + assert ( + appeal_transfer.origin_docket_number == originating_court["case"] + ) workload_transfer = transfers.get(transfer_type=CaseTransfer.WORKLOAD) + assert workload_transfer.destination_court == self.texas_coa1 + assert ( + workload_transfer.destination_docket_number + == self.docket_number_coa1 + ) assert workload_transfer.origin_court == texas_coa2 - assert workload_transfer.origin_docket_number == "02-24-00999-CV" + assert ( + workload_transfer.origin_docket_number + == transfer_from["origin_docket"] + ) def test_merge_texas_case_transfers_supreme_court(self): """Can we create a CaseTransfer for a Supreme Court case?""" - docket_sc = DocketFactory.create( - court=self.texas_sc, docket_number="25-0100" - ) + docket_sc = DocketFactory.create(court=self.texas_sc) appeals_court = TexasAppellateCourtInfoDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, @@ -2936,12 +2945,7 @@ def test_merge_texas_case_transfers_supreme_court(self): ) docket_data = TexasFinalCourtDocketDictFactory( court_id=CourtID.SUPREME_COURT.value, - docket_number="25-0100", - date_filed=date(2025, 1, 15), - originating_court=TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", - district=5, - ), + docket_number=docket_sc.docket_number, appeals_court=appeals_court, ) @@ -2950,22 +2954,19 @@ def test_merge_texas_case_transfers_supreme_court(self): assert result.success is True assert result.create is 
True - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_sc, - destination_docket_number="25-0100", - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 1 transfer = transfers.first() + assert transfer.destination_court == self.texas_sc + assert transfer.destination_docket_number == docket_sc.docket_number assert transfer.origin_court == self.texas_coa1 assert transfer.origin_docket_number == self.docket_number_coa1 assert transfer.transfer_type == CaseTransfer.APPEAL - assert transfer.transfer_date == date(2025, 1, 15) + assert transfer.transfer_date == docket_data["date_filed"] def test_merge_texas_case_transfers_cca_from_appellate(self): """Can we create a CaseTransfer for CCA from appellate court?""" - docket_cca = DocketFactory.create( - court=self.texas_cca, docket_number="PD-0100-25" - ) + docket_cca = DocketFactory.create(court=self.texas_cca) appeals_court = TexasAppellateCourtInfoDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, @@ -2973,12 +2974,7 @@ def test_merge_texas_case_transfers_cca_from_appellate(self): ) docket_data = TexasFinalCourtDocketDictFactory( court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, - docket_number="PD-0100-25", - date_filed=date(2025, 1, 15), - originating_court=TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", - district=5, - ), + docket_number=docket_cca.docket_number, appeals_court=appeals_court, ) @@ -2987,12 +2983,11 @@ def test_merge_texas_case_transfers_cca_from_appellate(self): assert result.success is True assert result.create is True - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_cca, - destination_docket_number="PD-0100-25", - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 1 transfer = transfers.first() + assert transfer.destination_court == self.texas_cca + assert transfer.destination_docket_number == docket_cca.docket_number assert transfer.origin_court == self.texas_coa1 assert 
transfer.origin_docket_number == self.docket_number_coa1 assert transfer.transfer_type == CaseTransfer.APPEAL @@ -3001,16 +2996,15 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( self, ): """Can we handle death penalty direct appeals to CCA?""" - docket_cca = DocketFactory.create( - court=self.texas_cca, docket_number="AP-76,000" - ) + texas_district = CourtFactory.create(id="texdistct6") + docket_cca = DocketFactory.create(court=self.texas_cca) originating_court = TexasOriginatingDistrictCourtDictFactory( district=5 ) docket_data = TexasFinalCourtDocketDictFactory( court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, - docket_number="AP-76,000", + docket_number=docket_cca.docket_number, originating_court=originating_court, appeals_court=None, ) @@ -3020,21 +3014,19 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( assert result.success is True assert result.create is True - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_cca, - destination_docket_number="AP-76,000", - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 1 transfer = transfers.first() - assert transfer.origin_court.id == "txdistct6" + assert transfer.destination_court == self.texas_cca + assert transfer.destination_docket_number == docket_cca.docket_number + assert transfer.origin_court == texas_district assert transfer.origin_docket_number == originating_court["case"] assert transfer.transfer_type == CaseTransfer.APPEAL def test_merge_texas_case_transfers_no_trial_court_info(self): """Do we handle appellate cases without trial court info?""" - originating_court = TexasOriginatingDistrictCourtDictFactory( + originating_court = TexasOriginatingCourtDictFactory( court_type="texas_unknown", - district=None, case="", ) docket_data = TexasCourtOfAppealsDocketDictFactory( @@ -3048,35 +3040,30 @@ def test_merge_texas_case_transfers_no_trial_court_info(self): assert result.success is True - transfers = 
CaseTransfer.objects.filter( - destination_court=self.texas_coa1, - destination_docket_number=self.docket_number_coa1, - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 0 def test_merge_texas_case_transfers_duplicate_handling(self): """Do we properly handle duplicate CaseTransfer objects?""" texas_district = CourtFactory.create(id="texdistct6") - CaseTransfer.objects.create( + transfer = CaseTransferFactory.create( origin_court=texas_district, - origin_docket_number="2023-12345", destination_court=self.texas_coa1, destination_docket_number=self.docket_number_coa1, - transfer_date=date(2025, 1, 15), transfer_type=CaseTransfer.APPEAL, ) originating_court = TexasOriginatingDistrictCourtDictFactory( court_type="texas_district", district=5, - case="2023-12345", + case=transfer.origin_docket_number, ) docket_data = TexasCourtOfAppealsDocketDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, - date_filed=date(2025, 1, 15), originating_court=originating_court, + date_filed=transfer.transfer_date, transfer_from=None, ) @@ -3085,10 +3072,7 @@ def test_merge_texas_case_transfers_duplicate_handling(self): assert result.success is True assert result.create is False - transfers = CaseTransfer.objects.filter( - destination_court=self.texas_coa1, - destination_docket_number=self.docket_number_coa1, - ) + transfers = CaseTransfer.objects.all() assert transfers.count() == 1 diff --git a/cl/search/factories.py b/cl/search/factories.py index bf3e3c319b..57dbff2e5d 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -25,6 +25,7 @@ PRECEDENTIAL_STATUS, SOURCES, BankruptcyInformation, + CaseTransfer, Citation, Court, Docket, @@ -428,3 +429,23 @@ class ScotusDocketDataFactory(DictFactory): questions_presented = Faker("url") docket_entries = List([SubFactory(SCOTUSDocketEntryFactory)]) parties = [] + + +class CaseTransferFactory(DjangoModelFactory): + origin_court = SubFactory(CourtFactory) + 
origin_docket_number = Faker("federal_district_docket_number") + destination_court = SubFactory(CourtFactory) + destination_docket_number = Faker("federal_district_docket_number") + transfer_date = Faker("date_object") + transfer_type = Faker( + "random_element", + elements=( + CaseTransfer.APPEAL, + CaseTransfer.WORKLOAD, + CaseTransfer.MERGE, + CaseTransfer.JURISDICTION, + ), + ) + + class Meta: + model = CaseTransfer diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 068ce11862..7f7acd6bb2 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -41,8 +41,6 @@ class TexasOriginatingCourtDictFactory(DictFactory): court_type = Faker( "random_element", elements=( - "texas_appellate", - "texas_district", "texas_probate", "texas_business", "texas_county", @@ -62,6 +60,7 @@ class TexasOriginatingCourtDictFactory(DictFactory): class TexasOriginatingAppellateCourtDictFactory( TexasOriginatingCourtDictFactory ): + court_type = "texas_appellate" court_id = Faker( "random_element", elements=("texas_coa01", "texas_coa02", "texas_coa14", "texas_coa15"), @@ -71,6 +70,7 @@ class TexasOriginatingAppellateCourtDictFactory( class TexasOriginatingDistrictCourtDictFactory( TexasOriginatingCourtDictFactory ): + court_type = "texas_district" district = Faker("random_element", elements=list(range(1, 527)) + [None]) From b912d33104caed5505fa89de7c184417bc36f47f Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 13:39:07 -0700 Subject: [PATCH 16/87] test(texas): Docket merger test Implement docket merger tests and fix remaining issues --- cl/corpus_importer/tasks.py | 8 +++-- cl/corpus_importer/tests.py | 49 ++++++++++++++++++++++++++++++ cl/search/state/texas/factories.py | 7 +++-- 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index b3e55f2f36..d5771df982 100644 --- 
a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4040,7 +4040,9 @@ def merge_texas_docket( ) if lower_court_id is not None: - docket.appeal_from = lower_court_id + court = Court.objects.get(pk=lower_court_id) + docket.appeal_from = court + court_name = court.full_name else: logger.warning( "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", @@ -4048,7 +4050,9 @@ def merge_texas_docket( docket.pk, court.pk, ) - docket.appeal_from_str = lower_court_data.get("name") + # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts + court_name = lower_court_data["name"] + docket.appeal_from_str = court_name docket.save() diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 11195bacc2..eb884cd221 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -87,6 +87,7 @@ generate_ia_json, get_and_save_free_document_report, merge_texas_case_transfers, + merge_texas_docket, merge_texas_docket_entry, merge_texas_docket_originating_court, merge_texas_document, @@ -3075,6 +3076,54 @@ def test_merge_texas_case_transfers_duplicate_handling(self): transfers = CaseTransfer.objects.all() assert transfers.count() == 1 + def test_merge_texas_docket_appellate_sets_appeal_from(self): + """Does merge_texas_docket set appeal_from for appellate courts?""" + texas_district = CourtFactory.create(id="texdistct6") + originating_court = TexasOriginatingDistrictCourtDictFactory( + court_type="texas_district", + district=5, + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + docket_number=self.docket_number_coa1, + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_docket(docket_data) + + assert result.success is True + assert result.pk == self.docket_coa1.pk + + self.docket_coa1.refresh_from_db() + assert self.docket_coa1.date_filed == docket_data["date_filed"] 
+ assert self.docket_coa1.cause == docket_data["case_type"] + assert self.docket_coa1.appeal_from_id == "texdistct6" + assert self.docket_coa1.appeal_from_str == texas_district.full_name + + def test_merge_texas_docket_final_court_sets_appeal_from(self): + """Does merge_texas_docket set appeal_from for final courts?""" + docket_sc = DocketFactory.create(court=self.texas_sc) + appeals_court = TexasAppellateCourtInfoDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + docket_number=docket_sc.docket_number, + appeals_court=appeals_court, + ) + + result = merge_texas_docket(docket_data) + + assert result.success is True + assert result.pk == docket_sc.pk + + docket_sc.refresh_from_db() + assert docket_sc.date_filed == docket_data["date_filed"] + assert docket_sc.cause == docket_data["case_type"] + assert docket_sc.appeal_from_id == "txctapp1" + assert docket_sc.appeal_from_str == self.texas_coa1.full_name + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 7f7acd6bb2..921d4c7ef6 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -98,8 +98,11 @@ class TexasCommonDataDictFactory(DictFactory): originating_court = SubFactory(TexasOriginatingCourtDictFactory) case_events = List([SubFactory(TexasDocketEntryDictFactory)]) appellate_briefs = LazyAttribute( - lambda d: filter( - lambda e: True if random.random() < 0.1 else False, d.case_events + lambda d: list( + filter( + lambda e: True if random.random() < 0.1 else False, + d.case_events, + ) ) ) From cf5bb681d0e823c9340531409834061a4385343c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 10 Feb 2026 14:31:55 -0700 Subject: [PATCH 17/87] fix(texas): Lazy merge Fix things that broke when I was lazy while 
merging --- cl/corpus_importer/tasks.py | 8 ++------ cl/corpus_importer/tests.py | 2 ++ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 2442423d96..950c92ddb6 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3520,14 +3520,10 @@ def merge_texas_document( needs_update = ( str(texas_document.media_version_id) != input_document["media_version_id"] + or not texas_document.filepath_local ) - if ( - created - or str(texas_document.media_version_id) - != input_document["media_version_id"] - or not texas_document.filepath_local - ): + if needs_update: texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.document_url = input_document["document_url"] diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 998d004144..d1f3b03348 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2362,6 +2362,7 @@ def test_merge_texas_docket_entry_no_update(self): result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) + pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: document.filepath_local = "a" @@ -2400,6 +2401,7 @@ def test_merge_texas_docket_entry_add_document(self): result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) + pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: document.filepath_local = "a" From 831525939ecf89eafc9575dc59f0e7a8c489ef06 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 11 Feb 2026 15:14:26 -0700 Subject: [PATCH 18/87] feat(texas): Importer command --- .../commands/import_texas_dockets.py | 17 ++ cl/corpus_importer/tasks.py | 20 ++ cl/lib/command_utils.py | 188 ++++++++++++++++++ 3 
files changed, 225 insertions(+) create mode 100644 cl/corpus_importer/management/commands/import_texas_dockets.py diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py new file mode 100644 index 0000000000..3865ff5376 --- /dev/null +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -0,0 +1,17 @@ +from cl.celery_init import app +from cl.corpus_importer.tasks import merge_texas_docket, parse_texas_docket +from cl.lib.command_utils import CorpusImporterCommand + + +class Command(CorpusImporterCommand): + help = "Import Texas dockets from S3 using an inventory CSV." + + compose_redis_key = "texas_docket_import:log" + + @staticmethod + def parse_task() -> app.Task: + return parse_texas_docket + + @staticmethod + def merge_task() -> app.Task: + return merge_texas_docket diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 950c92ddb6..a378658ac7 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3970,6 +3970,10 @@ def generate_texas_appellate_brief_flags( return flags +@app.task( + max_retries=5, + ignore_result=True, +) def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4115,3 +4119,19 @@ def merge_texas_docket( success=success, pk=docket.pk, ) + + +@app.task( + bind=True, + max_retries=5, + ignore_result=True, +) +def parse_texas_docket( + self: Task, bucket: str, s3_key: str +) -> ( + TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket + | None +): + raise NotImplementedError diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py index ee86463812..b50a0e7388 100644 --- a/cl/lib/command_utils.py +++ b/cl/lib/command_utils.py @@ -1,9 +1,23 @@ +import csv import logging import os +import time +from abc import ABC, abstractmethod +from itertools import islice +import botocore.exceptions +from celery import chain 
+from django.conf import settings from django.core.management import BaseCommand, CommandError +from cl.celery_init import app +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, + log_last_document_indexed, +) from cl.lib.juriscraper_utils import get_module_by_court_id +from cl.lib.storage import AWSMediaStorage logger = logging.getLogger(__name__) @@ -58,6 +72,180 @@ def add_arguments(self, parser): ) +@app.task( + bind=True, + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +def _corpus_download_task(bucket: str, s3_key: str) -> tuple[bytes, str, str]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param bucket: S3 bucket name. + :param s3_key: S3 key to download file from. + :return: Tuple with entries: Bytes of downloaded file, the bucket + parameter, and the s3_key parameter.""" + logger.info("Downloading file from S3: %s", s3_key) + storage = AWSMediaStorage(bucket_name=bucket) + with storage.open(s3_key, "rb") as f: + content = f.read() + return content, bucket, s3_key + + +class CorpusImporterCommand(VerboseCommand, ABC): + """Base class for `cl.corpus_importer` commands encapsulating inventory + file reading, celery queue interactions, and redis logging. + + Uses an inventory CSV from S3 to find files to parse and ingest into the + database. Includes ratelimiting and autoresume logic. + + Required methods are: + + - `parse_task`: Should return a Celery task which parses a `bytes` object + into some usable format, typically using Juriscraper. Signature should be: + `task(content: bytes, bucket_name: str, s3_key: str)`, unless you manually + override `download_task` to return a different format. + - `merge_task`: Should return a Celery task which takes the output of + `parse_task` and merges it into the database. 
Input should be whatever the + output of `parse_task` is. + + Required properties are: + + - `compose_redis_key`: The Redis log key to use for tracking progress. + + Optional methods are: + - `download_task`: Should return the task used to download files from S3. A + default implementation is provided for convenience.""" + + compose_redis_key: str + + def add_arguments(self, parser): + parser.add_argument( + "--inventory-file", + required=True, + help="Path to the inventory CSV relative to MEDIA_ROOT.", + ) + parser.add_argument( + "--retrieval-queue", + default="celery", + help="Which celery queue to use for S3 retrieval.", + ) + parser.add_argument( + "--parsing-queue", + default="celery", + help="Which celery queue to use for document parsing.", + ) + parser.add_argument( + "--ingesting-queue", + default="celery", + help="Which celery queue to use for DB ingesting.", + ) + parser.add_argument( + "--throttle-min-items", + type=int, + default=5, + help="CeleryThrottle min_items parameter.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Seconds to sleep between scheduling tasks.", + ) + parser.add_argument( + "--start-row", + type=int, + default=0, + help="Row number to start from (for manual resume).", + ) + parser.add_argument( + "--inventory-rows", + type=int, + required=True, + help="Total number of rows in the inventory CSV. Used to " + "log progress percentage.", + ) + parser.add_argument( + "--auto-resume", + action="store_true", + default=False, + help="Resume from last row stored in Redis.", + ) + + @staticmethod + def download_task() -> app.Task: + return _corpus_download_task + + @staticmethod + @abstractmethod + def parse_task() -> app.Task: ... + + @staticmethod + @abstractmethod + def merge_task() -> app.Task: ... 
+ + def handle(self, *args, **options): + super().handle(*args, **options) + + retrieval_queue = options["retrieval_queue"] + parse_queue = options["parsing_queue"] + ingesting_queue = options["ingesting_queue"] + delay = options["delay"] + inventory_rows = options["inventory_rows"] + inventory_path = settings.MEDIA_ROOT / options["inventory_file"] + + start_row = options["start_row"] + if options["auto_resume"]: + start_row = get_last_parent_document_id_processed( + self.compose_redis_key + ) + logger.info("Auto-resuming from row %s.", start_row) + + total_rows = inventory_rows - start_row + + throttle = CeleryThrottle( + min_items=options["throttle_min_items"], + queue_name=ingesting_queue, + ) + + with open(inventory_path) as f: + reader = csv.reader(f) + for row_idx, row in islice(enumerate(reader), start_row, None): + bucket = row[0].strip() + s3_key = row[1].strip() + + throttle.maybe_wait() + chain( + self.download_task() + .si(bucket, s3_key) + .set(queue=retrieval_queue), + self.parse_task().s().set(queue=parse_queue), + self.merge_task().s().set(queue=ingesting_queue), + ).apply_async() + time.sleep(delay) + + if row_idx % 100 == 0: + processed = row_idx - start_row + progress = ( + f" ({processed / total_rows:.1%})" + if total_rows + else "" + ) + logger.info( + "Scheduled %s rows %s. Current row: %s.", + row_idx, + progress, + s3_key, + ) + log_last_document_indexed(row_idx, self.compose_redis_key) + + logger.info("Finished scheduling all rows from inventory.") + + class CommandUtils: """A mixin to give some useful methods to sub classes.""" From 76dbc35cae9625acf79c7196fb900f7cedf4331a Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:47:37 -0700 Subject: [PATCH 19/87] feat(texas): Fully operational Texas merger command Implement parsing step of Texas merger command and update to account for actual data structure. 
--- .github/workflows/lint.yml | 3 + .../commands/import_texas_dockets.py | 80 ++++++- cl/corpus_importer/management/utils.py | 225 ++++++++++++++++++ cl/corpus_importer/tasks.py | 37 ++- cl/lib/command_utils.py | 188 --------------- cl/lib/decorators.py | 17 ++ 6 files changed, 357 insertions(+), 193 deletions(-) create mode 100644 cl/corpus_importer/management/utils.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index c707c4b918..d11ab88cbc 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -82,3 +82,6 @@ jobs: cl/search/docket_number_cleaner.py \ cl/search/management/commands/clean_docket_number_raw.py \ cl/scrapers/management/commands/back_scrape_dockets.py \ + cl/corpus_importer/management/commands/import_texas_dockets.py \ + cl/corpus_importer/management/utils.py \ + cl/lib/decorators.py diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index 3865ff5376..86057edd8c 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -1,6 +1,59 @@ +import json +from collections.abc import Iterable +from itertools import batched + +import botocore.exceptions + from cl.celery_init import app +from cl.corpus_importer.management.utils import ( + CorpusImporterCommand, + TexasDocketMeta, +) from cl.corpus_importer.tasks import merge_texas_docket, parse_texas_docket -from cl.lib.command_utils import CorpusImporterCommand +from cl.lib.command_utils import logger +from cl.lib.storage import AWSMediaStorage + + +@app.task( + bind=True, + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +def _texas_corpus_download_task( + docket: tuple[str, str], + docket_headers: tuple[str, str], + docket_meta: tuple[str, str], +) -> tuple[bytes, dict[str, 
str], TexasDocketMeta]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param docket: Tuple of S3 bucket name and key where docket HTML is stored. + :param docket_headers: Tuple of S3 bucket name and key where docket + response headers are stored. + :param docket_meta: Tuple of S3 bucket name and key where docket metadata + is stored. + :return: Tuple with entries: Bytes of downloaded file, dictionary with + response headers, and docket metadata.""" + storage = AWSMediaStorage(bucket_name=docket[0]) + logger.info("Downloading HTML file from S3: %s", docket[1]) + with storage.open(docket[1], "rb") as f: + content = f.read() + + storage = AWSMediaStorage(bucket=docket_headers[0]) + logger.info("Downloading docket headers from S3: %s", docket_headers[1]) + with storage.open(docket_headers[1], "r") as f: + headers = json.load(f) + + storage = AWSMediaStorage(bucket=docket_meta[0]) + logger.info("Downloading docket meta from S3: %s", docket_meta[1]) + with storage.open(docket_meta[1], "r") as f: + meta = TexasDocketMeta.model_validate_json(f.read()) + + return content, headers, meta class Command(CorpusImporterCommand): @@ -8,6 +61,31 @@ class Command(CorpusImporterCommand): compose_redis_key = "texas_docket_import:log" + @staticmethod + def inventory_row_batch_to_download( + batch: tuple[list[str], ...], + ) -> tuple[tuple[str, str], tuple[str, str], tuple[str, str]]: + """Extracts S3 buckets and paths from a batch of three entries from the + Texas inventory file. 
These will point to: the docket HTML, the docket + response headers, and metadata about the docket.""" + return ( + (batch[0][0].strip(), batch[0][1].strip()), + (batch[1][0].strip(), batch[1][1].strip()), + (batch[2][0].strip(), batch[2][1].strip()), + ) + + @staticmethod + def transform_inventory_iterator( + csv_reader: Iterable[list[str]], + ) -> Iterable[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: + return map( + Command.inventory_row_batch_to_download, batched(csv_reader, 3) + ) + + @staticmethod + def download_task() -> app.Task: + return _texas_corpus_download_task + @staticmethod def parse_task() -> app.Task: return parse_texas_docket diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py new file mode 100644 index 0000000000..c5c1c6ba58 --- /dev/null +++ b/cl/corpus_importer/management/utils.py @@ -0,0 +1,225 @@ +import csv +import time +from abc import ABC, abstractmethod +from collections.abc import Iterable +from datetime import datetime +from itertools import islice +from typing import final + +import botocore.exceptions +from celery import chain +from django.conf import settings +from pydantic import BaseModel + +from cl.celery_init import app +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.command_utils import VerboseCommand, logger +from cl.lib.indexing_utils import ( + get_last_parent_document_id_processed, + log_last_document_indexed, +) +from cl.lib.storage import AWSMediaStorage + + +class TexasDocketMeta(BaseModel): + case_number: str + case_url: str + date_filed: datetime + style: str + v: str + case_type: str + coa_case_number: str + trial_court_case_number: str + trial_court_county: str + trial_court: str + appellate_court: str + court_code: str + + +@app.task( + bind=True, + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +def _corpus_download_task(bucket: str, 
s3_key: str) -> tuple[bytes, str, str]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param bucket: S3 bucket name. + :param s3_key: S3 key to download file from. + :return: Tuple with entries: Bytes of downloaded file, the bucket + parameter, and the s3_key parameter.""" + logger.info("Downloading file from S3: %s", s3_key) + storage = AWSMediaStorage(bucket_name=bucket) + with storage.open(s3_key, "rb") as f: + content = f.read() + return content, bucket, s3_key + + +class CorpusImporterCommand(VerboseCommand, ABC): + """Base class for `cl.corpus_importer` commands encapsulating inventory + file reading, celery queue interactions, and redis logging. + + Uses an inventory CSV from S3 to find files to parse and ingest into the + database. Includes ratelimiting and autoresume logic. + + Required methods are: + + - `parse_task`: Should return a Celery task which parses a `bytes` object + into some usable format, typically using Juriscraper. Signature should be: + `task(content: bytes, bucket_name: str, s3_key: str)`, unless you manually + override `download_task` to return a different format. + - `merge_task`: Should return a Celery task which takes the output of + `parse_task` and merges it into the database. Input should be whatever the + output of `parse_task` is. + + Required properties are: + + - `compose_redis_key`: The Redis log key to use for tracking progress. + + Optional methods are: + - `download_task`: Should return the task used to download files from S3. 
A + default implementation is provided for convenience.""" + + compose_redis_key: str + + @final + def add_arguments(self, parser): + parser.add_argument( + "--inventory-file", + required=True, + help="Path to the inventory CSV relative to MEDIA_ROOT.", + ) + parser.add_argument( + "--retrieval-queue", + default="celery", + help="Which celery queue to use for S3 retrieval.", + ) + parser.add_argument( + "--parsing-queue", + default="celery", + help="Which celery queue to use for document parsing.", + ) + parser.add_argument( + "--ingesting-queue", + default="celery", + help="Which celery queue to use for DB ingesting.", + ) + parser.add_argument( + "--throttle-min-items", + type=int, + default=5, + help="CeleryThrottle min_items parameter.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Seconds to sleep between scheduling tasks.", + ) + parser.add_argument( + "--start-row", + type=int, + default=0, + help="Row number to start from (for manual resume).", + ) + parser.add_argument( + "--inventory-rows", + type=int, + required=True, + help="Total number of rows in the inventory CSV. Used to " + "log progress percentage.", + ) + parser.add_argument( + "--auto-resume", + action="store_true", + default=False, + help="Resume from last row stored in Redis.", + ) + + @staticmethod + def download_task() -> app.Task: + return _corpus_download_task + + @staticmethod + @abstractmethod + def parse_task() -> app.Task: ... + + @staticmethod + @abstractmethod + def merge_task() -> app.Task: ... + + @staticmethod + def transform_inventory_iterator( + csv_reader: Iterable[list[str]], + ) -> Iterable: + """Optionally performs transformations on the inventory CSV file before + passing it to the download Celery task. Can be used for instance to + merge consecutive rows which represent the same docket into one object. + + :param csv_reader: The `csv.Reader` object to use to read the CSV. + :return: The transformed inventory CSV iterator. 
The item of the + iterable should be a list of arguments to be passed to the download + task.""" + return map(lambda row: [row[0].strip(), row[1].strip()], csv_reader) + + @final + def handle(self, *args, **options): + super().handle(*args, **options) + + retrieval_queue = options["retrieval_queue"] + parse_queue = options["parsing_queue"] + ingesting_queue = options["ingesting_queue"] + delay = options["delay"] + inventory_rows = options["inventory_rows"] + inventory_path = settings.MEDIA_ROOT / options["inventory_file"] + + start_row = options["start_row"] + if options["auto_resume"]: + start_row = get_last_parent_document_id_processed( + self.compose_redis_key + ) + logger.info("Auto-resuming from row %s.", start_row) + + total_rows = inventory_rows - start_row + + throttle = CeleryThrottle( + min_items=options["throttle_min_items"], + queue_name=ingesting_queue, + ) + + with open(inventory_path) as f: + reader = self.transform_inventory_iterator(csv.reader(f)) + for row_idx, download_args in islice( + enumerate(reader), start_row, None + ): + throttle.maybe_wait() + chain( + self.download_task() + .si(*download_args) + .set(queue=retrieval_queue), + self.parse_task().s().set(queue=parse_queue), + self.merge_task().s().set(queue=ingesting_queue), + ).apply_async() + time.sleep(delay) + + if row_idx % 100 == 0: + processed = row_idx - start_row + progress = ( + f" ({processed / total_rows:.1%})" + if total_rows + else "" + ) + logger.info( + "Scheduled %s rows %s. 
Current row: %s.", + row_idx, + progress, + download_args, + ) + log_last_document_indexed(row_idx, self.compose_redis_key) + + logger.info("Finished scheduling all rows from inventory.") diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index a378658ac7..46921ff0cc 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -60,16 +60,21 @@ TexasCaseEvent, TexasCaseParty, TexasCourtOfCriminalAppealsDocket, + TexasCourtOfCriminalAppealsScraper, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, TexasSupremeCourtDocket, + TexasSupremeCourtScraper, ) from juriscraper.state.texas.common import ( CourtID, TexasAppellateBrief, TexasCaseDocument, ) -from juriscraper.state.texas.court_of_appeals import TexasCourtOfAppealsDocket +from juriscraper.state.texas.court_of_appeals import ( + TexasCourtOfAppealsDocket, + TexasCourtOfAppealsScraper, +) from openai import ( APIConnectionError, APIError, @@ -100,6 +105,7 @@ from cl.citations.utils import filter_out_non_case_law_citations from cl.corpus_importer.api_serializers import IADocketSerializer from cl.corpus_importer.llm_models import CaseNameExtractionResponse +from cl.corpus_importer.management.utils import TexasDocketMeta from cl.corpus_importer.prompts.system import CASE_NAME_EXTRACT_SYSTEM from cl.corpus_importer.utils import ( DownloadPDFResult, @@ -115,7 +121,7 @@ from cl.custom_filters.templatetags.text_filters import best_case_name from cl.lib.celery_utils import throttle_task from cl.lib.crypto import sha1 -from cl.lib.decorators import retry +from cl.lib.decorators import retry, time_call from cl.lib.llm import call_llm from cl.lib.microservice_utils import ( doc_page_count_service, @@ -3974,6 +3980,7 @@ def generate_texas_appellate_brief_flags( max_retries=5, ignore_result=True, ) +@time_call def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4126,12 +4133,34 @@ def merge_texas_docket( max_retries=5, 
ignore_result=True, ) +@time_call def parse_texas_docket( - self: Task, bucket: str, s3_key: str + self: Task, content: bytes, headers: dict[str, str], meta: TexasDocketMeta ) -> ( TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket | None ): - raise NotImplementedError + """Uses Juriscraper to parse bytes into a Texas docket object. + + :param self: The Celery task. + :param content: Bytes string to parse. + :param headers: The response headers to the scraper. + :param meta: Docket metadata. + :return: The parsed docket or `None` if parsing failed.""" + if meta.court_code == "cossup": + parser = TexasSupremeCourtScraper() + elif meta.court_code == "coscca": + parser = TexasCourtOfCriminalAppealsScraper() + elif meta.court_code.startswith("coa"): + parser = TexasCourtOfAppealsScraper(meta.court_code) + else: + logger.error( + "Unrecognized Texas court type %s. Cannot parse.", meta.court_code + ) + self.request.chain = None + return None + + parser._parse_text(content.decode("utf-8")) + return parser.data diff --git a/cl/lib/command_utils.py b/cl/lib/command_utils.py index b50a0e7388..ee86463812 100644 --- a/cl/lib/command_utils.py +++ b/cl/lib/command_utils.py @@ -1,23 +1,9 @@ -import csv import logging import os -import time -from abc import ABC, abstractmethod -from itertools import islice -import botocore.exceptions -from celery import chain -from django.conf import settings from django.core.management import BaseCommand, CommandError -from cl.celery_init import app -from cl.lib.celery_utils import CeleryThrottle -from cl.lib.indexing_utils import ( - get_last_parent_document_id_processed, - log_last_document_indexed, -) from cl.lib.juriscraper_utils import get_module_by_court_id -from cl.lib.storage import AWSMediaStorage logger = logging.getLogger(__name__) @@ -72,180 +58,6 @@ def add_arguments(self, parser): ) -@app.task( - bind=True, - autoretry_for=( - botocore.exceptions.HTTPClientError, - 
botocore.exceptions.ConnectionError, - ), - max_retries=5, - retry_backoff=10, - ignore_result=True, -) -def _corpus_download_task(bucket: str, s3_key: str) -> tuple[bytes, str, str]: - """Downloads a scraped file from S3 and returns it for parsing. - - :param bucket: S3 bucket name. - :param s3_key: S3 key to download file from. - :return: Tuple with entries: Bytes of downloaded file, the bucket - parameter, and the s3_key parameter.""" - logger.info("Downloading file from S3: %s", s3_key) - storage = AWSMediaStorage(bucket_name=bucket) - with storage.open(s3_key, "rb") as f: - content = f.read() - return content, bucket, s3_key - - -class CorpusImporterCommand(VerboseCommand, ABC): - """Base class for `cl.corpus_importer` commands encapsulating inventory - file reading, celery queue interactions, and redis logging. - - Uses an inventory CSV from S3 to find files to parse and ingest into the - database. Includes ratelimiting and autoresume logic. - - Required methods are: - - - `parse_task`: Should return a Celery task which parses a `bytes` object - into some usable format, typically using Juriscraper. Signature should be: - `task(content: bytes, bucket_name: str, s3_key: str)`, unless you manually - override `download_task` to return a different format. - - `merge_task`: Should return a Celery task which takes the output of - `parse_task` and merges it into the database. Input should be whatever the - output of `parse_task` is. - - Required properties are: - - - `compose_redis_key`: The Redis log key to use for tracking progress. - - Optional methods are: - - `download_task`: Should return the task used to download files from S3. 
A - default implementation is provided for convenience.""" - - compose_redis_key: str - - def add_arguments(self, parser): - parser.add_argument( - "--inventory-file", - required=True, - help="Path to the inventory CSV relative to MEDIA_ROOT.", - ) - parser.add_argument( - "--retrieval-queue", - default="celery", - help="Which celery queue to use for S3 retrieval.", - ) - parser.add_argument( - "--parsing-queue", - default="celery", - help="Which celery queue to use for document parsing.", - ) - parser.add_argument( - "--ingesting-queue", - default="celery", - help="Which celery queue to use for DB ingesting.", - ) - parser.add_argument( - "--throttle-min-items", - type=int, - default=5, - help="CeleryThrottle min_items parameter.", - ) - parser.add_argument( - "--delay", - type=float, - default=1.0, - help="Seconds to sleep between scheduling tasks.", - ) - parser.add_argument( - "--start-row", - type=int, - default=0, - help="Row number to start from (for manual resume).", - ) - parser.add_argument( - "--inventory-rows", - type=int, - required=True, - help="Total number of rows in the inventory CSV. Used to " - "log progress percentage.", - ) - parser.add_argument( - "--auto-resume", - action="store_true", - default=False, - help="Resume from last row stored in Redis.", - ) - - @staticmethod - def download_task() -> app.Task: - return _corpus_download_task - - @staticmethod - @abstractmethod - def parse_task() -> app.Task: ... - - @staticmethod - @abstractmethod - def merge_task() -> app.Task: ... 
- - def handle(self, *args, **options): - super().handle(*args, **options) - - retrieval_queue = options["retrieval_queue"] - parse_queue = options["parsing_queue"] - ingesting_queue = options["ingesting_queue"] - delay = options["delay"] - inventory_rows = options["inventory_rows"] - inventory_path = settings.MEDIA_ROOT / options["inventory_file"] - - start_row = options["start_row"] - if options["auto_resume"]: - start_row = get_last_parent_document_id_processed( - self.compose_redis_key - ) - logger.info("Auto-resuming from row %s.", start_row) - - total_rows = inventory_rows - start_row - - throttle = CeleryThrottle( - min_items=options["throttle_min_items"], - queue_name=ingesting_queue, - ) - - with open(inventory_path) as f: - reader = csv.reader(f) - for row_idx, row in islice(enumerate(reader), start_row, None): - bucket = row[0].strip() - s3_key = row[1].strip() - - throttle.maybe_wait() - chain( - self.download_task() - .si(bucket, s3_key) - .set(queue=retrieval_queue), - self.parse_task().s().set(queue=parse_queue), - self.merge_task().s().set(queue=ingesting_queue), - ).apply_async() - time.sleep(delay) - - if row_idx % 100 == 0: - processed = row_idx - start_row - progress = ( - f" ({processed / total_rows:.1%})" - if total_rows - else "" - ) - logger.info( - "Scheduled %s rows %s. 
Current row: %s.", - row_idx, - progress, - s3_key, - ) - log_last_document_indexed(row_idx, self.compose_redis_key) - - logger.info("Finished scheduling all rows from inventory.") - - class CommandUtils: """A mixin to give some useful methods to sub classes.""" diff --git a/cl/lib/decorators.py b/cl/lib/decorators.py index 279b01900e..0980ca6884 100644 --- a/cl/lib/decorators.py +++ b/cl/lib/decorators.py @@ -295,3 +295,20 @@ def document_model(model: type[models.Model]) -> type[models.Model]: field.db_comment = doc return model + + +def time_call(fn_logger: logging.Logger) -> Callable: + def decorator(f: Callable) -> Callable: + @wraps(f) + def wrapper(*args: Any, **kwargs: Any) -> Any: + start = time.perf_counter_ns() + result = f(*args, **kwargs) + elapsed = time.perf_counter_ns() - start + fn_logger.debug( + "Ran %s in %d.3 ms", f.__qualname__, elapsed / 1_000_000 + ) + return result + + return wrapper + + return decorator From 2dbd0211ecd24e9103b3c40c727fdf7f21b4726f Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:07:35 -0700 Subject: [PATCH 20/87] chore(texas): Update Juriscraper Update Juriscraper so we have the new Texas parser changes. 
--- cl/corpus_importer/tasks.py | 4 +++- pyproject.toml | 4 ++-- uv.lock | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 098caec8b4..d780b7f407 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -60,6 +60,8 @@ TexasCaseEvent, TexasCaseParty, TexasCourtOfCriminalAppealsDocket, + TexasOriginatingAppellateCourt, + TexasOriginatingDistrictCourt, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, TexasSupremeCourtDocket, @@ -3730,7 +3732,7 @@ def texas_js_court_id_to_court_id(js_court_id: str) -> str: def texas_originating_court_to_court_id( - court_data: TexasOriginatingCourt, + court_data: TexasOriginatingAppellateCourt | TexasOriginatingDistrictCourt, ) -> str | None: """Attempts to translate Juriscraper Texas originating court data to a CourtListener Court ID. diff --git a/pyproject.toml b/pyproject.toml index 23d3c9d39c..d1f6c4c579 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ urls.Home = "https://www.courtlistener.com/" urls.Repository = "https://github.com/freelawproject/courtlistener" urls.Documentation = "https://github.com/freelawproject/courtlistener/wiki" license = " AGPL-3.0-only" -license-files = [ "LICENSE.txt" ] +license-files = ["LICENSE.txt"] dependencies = [ "ada-url>=1.28.0", @@ -112,7 +112,7 @@ dependencies = [ "django-cotton>=2.6.0", "django-cursor-pagination>=0.3.0", "django-elasticsearch-dsl>=8.0", - "juriscraper>=2.6.68", + "juriscraper>=2.7.6", "instructor>=1.14.1", "django-s3-express-cache>=0.1.0", "zohocrmsdk8-0==4.0.0", diff --git a/uv.lock b/uv.lock index e3b508e3c6..77270919a3 100644 --- a/uv.lock +++ b/uv.lock @@ -502,7 +502,7 @@ requires-dist = [ { name = "ipython", specifier = ">=9.9.0" }, { name = "itypes", specifier = ">=1.1.0" }, { name = "judge-pics", specifier = ">=2.0.5" }, - { name = "juriscraper", specifier = ">=2.6.68" }, + { name = "juriscraper", specifier = ">=2.7.6" }, { name = 
"kombu", specifier = ">=5.5.1" }, { name = "lxml", specifier = ">=6.0.2" }, { name = "markdown2", specifier = ">=2.5.4" }, From b1ce795912f72673db333d529b91d5fcffa20b7b Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:18:46 -0700 Subject: [PATCH 21/87] fix(texas): Timing logger Add logger to timing decorator; fix typing nitpick with MergeResult --- .../management/commands/import_texas_dockets.py | 2 ++ cl/corpus_importer/tasks.py | 16 +++++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index 86057edd8c..f5a8ef4266 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -11,6 +11,7 @@ ) from cl.corpus_importer.tasks import merge_texas_docket, parse_texas_docket from cl.lib.command_utils import logger +from cl.lib.decorators import time_call from cl.lib.storage import AWSMediaStorage @@ -24,6 +25,7 @@ retry_backoff=10, ignore_result=True, ) +@time_call(logger) def _texas_corpus_download_task( docket: tuple[str, str], docket_headers: tuple[str, str], diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 04143a3ff0..6e39887036 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3461,7 +3461,7 @@ class MergeResult[T = int](NamedTuple): """The primary key of the created or updated object.""" @staticmethod - def created(pk: T) -> MergeResult[T]: + def created[S](pk: S) -> MergeResult[S]: """Shorthand for the result of a successful creation operation. :param pk: The primary key of the created object. 
@@ -3469,7 +3469,7 @@ def created(pk: T) -> MergeResult[T]: return MergeResult(create=True, update=False, success=True, pk=pk) @staticmethod - def updated(pk: T) -> MergeResult[T]: + def updated[S](pk: S) -> MergeResult[S]: """Shorthand for the result of a successful update operation. :param pk: The primary key of the updated object. @@ -3477,14 +3477,16 @@ def updated(pk: T) -> MergeResult[T]: return MergeResult(create=False, update=True, success=True, pk=pk) @staticmethod - def failed() -> MergeResult[T]: + def failed[S]() -> MergeResult[S]: """Shorthand for the result of a failed merge operation. :return: The constructed MergeResult object.""" - return MergeResult(create=False, update=False, success=False, pk=None) + return MergeResult[S]( + create=False, update=False, success=False, pk=None + ) @staticmethod - def unnecessary(pk: T) -> MergeResult[T]: + def unnecessary[S](pk: S) -> MergeResult[S]: """Shorthand for the result of a unnecessary merge operation. :return: The constructed MergeResult object.""" @@ -3982,7 +3984,7 @@ def generate_texas_appellate_brief_flags( max_retries=5, ignore_result=True, ) -@time_call +@time_call(logger) def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4135,7 +4137,7 @@ def merge_texas_docket( max_retries=5, ignore_result=True, ) -@time_call +@time_call(logger) def parse_texas_docket( self: Task, content: bytes, headers: dict[str, str], meta: TexasDocketMeta ) -> ( From 854bca105689cfc4d16b9e34bda247748971f27a Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:38:07 -0700 Subject: [PATCH 22/87] chore(texas): Update Juriscraper again --- pyproject.toml | 2 +- uv.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d1f6c4c579..66836d83d7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ dependencies = [ 
"django-cotton>=2.6.0", "django-cursor-pagination>=0.3.0", "django-elasticsearch-dsl>=8.0", - "juriscraper>=2.7.6", + "juriscraper>=2.7.7", "instructor>=1.14.1", "django-s3-express-cache>=0.1.0", "zohocrmsdk8-0==4.0.0", diff --git a/uv.lock b/uv.lock index 77270919a3..b764fd0bb5 100644 --- a/uv.lock +++ b/uv.lock @@ -502,7 +502,7 @@ requires-dist = [ { name = "ipython", specifier = ">=9.9.0" }, { name = "itypes", specifier = ">=1.1.0" }, { name = "judge-pics", specifier = ">=2.0.5" }, - { name = "juriscraper", specifier = ">=2.7.6" }, + { name = "juriscraper", specifier = ">=2.7.7" }, { name = "kombu", specifier = ">=5.5.1" }, { name = "lxml", specifier = ">=6.0.2" }, { name = "markdown2", specifier = ">=2.5.4" }, @@ -1814,7 +1814,7 @@ wheels = [ [[package]] name = "juriscraper" -version = "2.7.6" +version = "2.7.7" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -1832,9 +1832,9 @@ dependencies = [ { name = "selenium" }, { name = "tldextract" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f6/a1/caba7e6adcdadb90f2d0abcd7f314880b6651aa9abc8afaa2629a25f90fa/juriscraper-2.7.6.tar.gz", hash = "sha256:32ab337bf03ef70e998b28d782849031aca36ffbbcd105cb13c87e13521b29e4", size = 379379, upload-time = "2026-02-10T23:09:27.41Z" } +sdist = { url = "https://files.pythonhosted.org/packages/9f/e0/a368f6a908932e6beff52079bcd9ec1de06ccce1f6d79e22b8d048250d11/juriscraper-2.7.7.tar.gz", hash = "sha256:3bc282b64078f306f556cdc9ddc2d4600ef767a1ff82d07c676503767d9ab537", size = 379815, upload-time = "2026-02-13T17:30:13.109Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e8/c5/d9584596bf828e8f7ee1c166d880f64d31a95671f75650e12c726cabdc01/juriscraper-2.7.6-py3-none-any.whl", hash = "sha256:f5ce45abd917a04ea8c60680aad7bf79f12c104ecc772c0a409f7aa7d3942f91", size = 603387, upload-time = "2026-02-10T23:09:25.708Z" }, + { url = 
"https://files.pythonhosted.org/packages/81/f8/ea9842c89b6b9546bea21c159bdb16f9a4433ea673348864390d828eaa4f/juriscraper-2.7.7-py3-none-any.whl", hash = "sha256:f659323b1cb0e73d122d002f515e6c79a59621c92a16fb6563e33d6cbc02e79b", size = 603765, upload-time = "2026-02-13T17:30:11.469Z" }, ] [[package]] From 610b3a596db17ea941099fd84513f5051fa5a481 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:51:32 -0700 Subject: [PATCH 23/87] fix(texas): Broken migrations --- .../0052_case_transfer_docket_to_text_id.py | 71 ------------ .../0053_case_transfer_docket_to_text_id.py | 107 ++++++++++++++++++ ... 0053_case_transfer_docket_to_text_id.sql} | 0 ..._transfer_docket_to_text_id_customers.sql} | 0 4 files changed, 107 insertions(+), 71 deletions(-) delete mode 100644 cl/search/migrations/0052_case_transfer_docket_to_text_id.py create mode 100644 cl/search/migrations/0053_case_transfer_docket_to_text_id.py rename cl/search/migrations/{0052_case_transfer_docket_to_text_id.sql => 0053_case_transfer_docket_to_text_id.sql} (100%) rename cl/search/migrations/{0052_case_transfer_docket_to_text_id_customers.sql => 0053_case_transfer_docket_to_text_id_customers.sql} (100%) diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.py b/cl/search/migrations/0052_case_transfer_docket_to_text_id.py deleted file mode 100644 index 3c8c8f0701..0000000000 --- a/cl/search/migrations/0052_case_transfer_docket_to_text_id.py +++ /dev/null @@ -1,71 +0,0 @@ -# Generated by Django 6.0.1 on 2026-02-10 17:22 - -import pgtrigger.compiler -import pgtrigger.migrations -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('search', '0051_texas_models'), - ] - - operations = [ - pgtrigger.migrations.RemoveTrigger( - model_name='casetransfer', - name='update_update', - ), - pgtrigger.migrations.RemoveTrigger( - model_name='casetransfer', - name='delete_delete', - ), - 
migrations.RemoveField( - model_name='casetransfer', - name='destination_docket', - ), - migrations.RemoveField( - model_name='casetransfer', - name='origin_docket', - ), - migrations.RemoveField( - model_name='casetransferevent', - name='destination_docket', - ), - migrations.RemoveField( - model_name='casetransferevent', - name='origin_docket', - ), - migrations.AddField( - model_name='casetransfer', - name='destination_docket_number', - field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), - preserve_default=False, - ), - migrations.AddField( - model_name='casetransfer', - name='origin_docket_number', - field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), - preserve_default=False, - ), - migrations.AddField( - model_name='casetransferevent', - name='destination_docket_number', - field=models.TextField(db_comment='The ID of the case docket in the destination court.', default='', help_text='The ID of the case docket in the destination court.'), - preserve_default=False, - ), - migrations.AddField( - model_name='casetransferevent', - name='origin_docket_number', - field=models.TextField(db_comment='The ID of the docket this transfer originates from.', default='', help_text='The ID of the docket this transfer originates from.'), - preserve_default=False, - ), - pgtrigger.migrations.AddTrigger( - model_name='casetransfer', - trigger=pgtrigger.compiler.Trigger(name='update_update', sql=pgtrigger.compiler.UpsertTriggerSql(condition='WHEN (OLD."destination_court_id" IS DISTINCT FROM (NEW."destination_court_id") OR OLD."destination_docket_number" IS DISTINCT FROM (NEW."destination_docket_number") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."origin_court_id" IS DISTINCT FROM (NEW."origin_court_id") OR OLD."origin_docket_number" IS DISTINCT FROM 
(NEW."origin_docket_number") OR OLD."transfer_date" IS DISTINCT FROM (NEW."transfer_date") OR OLD."transfer_type" IS DISTINCT FROM (NEW."transfer_type"))', func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', hash='80b917540054665b8a49836933ea803aad526606', operation='UPDATE', pgid='pgtrigger_update_update_8e8e1', table='search_casetransfer', when='AFTER')), - ), - pgtrigger.migrations.AddTrigger( - model_name='casetransfer', - trigger=pgtrigger.compiler.Trigger(name='delete_delete', sql=pgtrigger.compiler.UpsertTriggerSql(func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', hash='1ebca4ba9fa46ebba2ad0edfdedbc2710d4cb1b6', operation='DELETE', pgid='pgtrigger_delete_delete_b8bc0', table='search_casetransfer', when='AFTER')), - ), - ] diff --git a/cl/search/migrations/0053_case_transfer_docket_to_text_id.py b/cl/search/migrations/0053_case_transfer_docket_to_text_id.py new file mode 100644 index 0000000000..55b8db455d --- /dev/null +++ 
b/cl/search/migrations/0053_case_transfer_docket_to_text_id.py @@ -0,0 +1,107 @@ +# Generated by Django 6.0.1 on 2026-02-10 17:22 + +import pgtrigger.compiler +import pgtrigger.migrations +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("search", "0052_add_scotus_docket_entry_and_scotus_documents"), + ] + + operations = [ + pgtrigger.migrations.RemoveTrigger( + model_name="casetransfer", + name="update_update", + ), + pgtrigger.migrations.RemoveTrigger( + model_name="casetransfer", + name="delete_delete", + ), + migrations.RemoveField( + model_name="casetransfer", + name="destination_docket", + ), + migrations.RemoveField( + model_name="casetransfer", + name="origin_docket", + ), + migrations.RemoveField( + model_name="casetransferevent", + name="destination_docket", + ), + migrations.RemoveField( + model_name="casetransferevent", + name="origin_docket", + ), + migrations.AddField( + model_name="casetransfer", + name="destination_docket_number", + field=models.TextField( + db_comment="The ID of the case docket in the destination court.", + default="", + help_text="The ID of the case docket in the destination court.", + ), + preserve_default=False, + ), + migrations.AddField( + model_name="casetransfer", + name="origin_docket_number", + field=models.TextField( + db_comment="The ID of the docket this transfer originates from.", + default="", + help_text="The ID of the docket this transfer originates from.", + ), + preserve_default=False, + ), + migrations.AddField( + model_name="casetransferevent", + name="destination_docket_number", + field=models.TextField( + db_comment="The ID of the case docket in the destination court.", + default="", + help_text="The ID of the case docket in the destination court.", + ), + preserve_default=False, + ), + migrations.AddField( + model_name="casetransferevent", + name="origin_docket_number", + field=models.TextField( + db_comment="The ID of the docket this transfer 
originates from.", + default="", + help_text="The ID of the docket this transfer originates from.", + ), + preserve_default=False, + ), + pgtrigger.migrations.AddTrigger( + model_name="casetransfer", + trigger=pgtrigger.compiler.Trigger( + name="update_update", + sql=pgtrigger.compiler.UpsertTriggerSql( + condition='WHEN (OLD."destination_court_id" IS DISTINCT FROM (NEW."destination_court_id") OR OLD."destination_docket_number" IS DISTINCT FROM (NEW."destination_docket_number") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."origin_court_id" IS DISTINCT FROM (NEW."origin_court_id") OR OLD."origin_docket_number" IS DISTINCT FROM (NEW."origin_docket_number") OR OLD."transfer_date" IS DISTINCT FROM (NEW."transfer_date") OR OLD."transfer_type" IS DISTINCT FROM (NEW."transfer_type"))', + func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', + hash="80b917540054665b8a49836933ea803aad526606", + operation="UPDATE", + pgid="pgtrigger_update_update_8e8e1", + table="search_casetransfer", + when="AFTER", + ), + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="casetransfer", + trigger=pgtrigger.compiler.Trigger( + name="delete_delete", + sql=pgtrigger.compiler.UpsertTriggerSql( + func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES 
(OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', + hash="1ebca4ba9fa46ebba2ad0edfdedbc2710d4cb1b6", + operation="DELETE", + pgid="pgtrigger_delete_delete_b8bc0", + table="search_casetransfer", + when="AFTER", + ), + ), + ), + ] diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id.sql b/cl/search/migrations/0053_case_transfer_docket_to_text_id.sql similarity index 100% rename from cl/search/migrations/0052_case_transfer_docket_to_text_id.sql rename to cl/search/migrations/0053_case_transfer_docket_to_text_id.sql diff --git a/cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql b/cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql similarity index 100% rename from cl/search/migrations/0052_case_transfer_docket_to_text_id_customers.sql rename to cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql From f71c98949958da5edafa6a382ae40a7112583631 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 12:19:54 -0700 Subject: [PATCH 24/87] fix(texas): Import test option and fixes Add testing option to import script and fix some oversights --- .../commands/import_texas_dockets.py | 11 ++++++--- cl/corpus_importer/management/utils.py | 24 +++++++++++++++---- cl/corpus_importer/tasks.py | 10 ++++---- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index f5a8ef4266..e3f4f37604 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -27,6 +27,7 @@ ) @time_call(logger) def 
_texas_corpus_download_task( + self: app.Task, docket: tuple[str, str], docket_headers: tuple[str, str], docket_meta: tuple[str, str], @@ -41,16 +42,20 @@ def _texas_corpus_download_task( :return: Tuple with entries: Bytes of downloaded file, dictionary with response headers, and docket metadata.""" storage = AWSMediaStorage(bucket_name=docket[0]) - logger.info("Downloading HTML file from S3: %s", docket[1]) + logger.info( + "Downloading HTML file from S3: (Bucket: %s; Path: %s)", + docket[0], + docket[1], + ) with storage.open(docket[1], "rb") as f: content = f.read() - storage = AWSMediaStorage(bucket=docket_headers[0]) + storage = AWSMediaStorage(bucket_name=docket_headers[0]) logger.info("Downloading docket headers from S3: %s", docket_headers[1]) with storage.open(docket_headers[1], "r") as f: headers = json.load(f) - storage = AWSMediaStorage(bucket=docket_meta[0]) + storage = AWSMediaStorage(bucket_name=docket_meta[0]) logger.info("Downloading docket meta from S3: %s", docket_meta[1]) with storage.open(docket_meta[1], "r") as f: meta = TexasDocketMeta.model_validate_json(f.read()) diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py index c5c1c6ba58..d9b95d7612 100644 --- a/cl/corpus_importer/management/utils.py +++ b/cl/corpus_importer/management/utils.py @@ -1,15 +1,16 @@ import csv +import random import time from abc import ABC, abstractmethod from collections.abc import Iterable -from datetime import datetime +from datetime import date from itertools import islice from typing import final import botocore.exceptions from celery import chain from django.conf import settings -from pydantic import BaseModel +from pydantic import BaseModel, field_validator from cl.celery_init import app from cl.lib.celery_utils import CeleryThrottle @@ -24,7 +25,7 @@ class TexasDocketMeta(BaseModel): case_number: str case_url: str - date_filed: datetime + date_filed: date style: str v: str case_type: str @@ -35,6 +36,10 @@ class 
TexasDocketMeta(BaseModel): appellate_court: str court_code: str + @field_validator("date_filed", mode="before") + def date_filed_validator(cls, v): + return date(*(time.strptime(v, "%m/%d/%Y")[0:3])) + @app.task( bind=True, @@ -140,6 +145,12 @@ def add_arguments(self, parser): default=False, help="Resume from last row stored in Redis.", ) + parser.add_argument( + "--test-random", + type=bool, + default=False, + help="Randomly select rows from the inventory file to import.", + ) @staticmethod def download_task() -> app.Task: @@ -192,8 +203,13 @@ def handle(self, *args, **options): queue_name=ingesting_queue, ) - with open(inventory_path) as f: + with open(inventory_path, encoding="utf-8") as f: reader = self.transform_inventory_iterator(csv.reader(f)) + if options["test_random"]: + logger.warning( + "In testing mode. Randomly selecting rows from the inventory file." + ) + reader = filter(lambda _: random.random() < 0.01, reader) for row_idx, download_args in islice( enumerate(reader), start_row, None ): diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 6e39887036..199314af08 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4139,7 +4139,7 @@ def merge_texas_docket( ) @time_call(logger) def parse_texas_docket( - self: Task, content: bytes, headers: dict[str, str], meta: TexasDocketMeta + self: Task, i: tuple[bytes, dict[str, str], TexasDocketMeta] ) -> ( TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4149,10 +4149,12 @@ def parse_texas_docket( """Uses Juriscraper to parse bytes into a Texas docket object. :param self: The Celery task. - :param content: Bytes string to parse. - :param headers: The response headers to the scraper. - :param meta: Docket metadata. + :param i: Tuple with the following entries: + - Bytes string to parse. + - The response headers to the scraper. + - Docket metadata. 
:return: The parsed docket or `None` if parsing failed.""" + content, headers, meta = i if meta.court_code == "cossup": parser = TexasSupremeCourtScraper() elif meta.court_code == "coscca": From 91b7d033c9f8d7c512d8b4d7f19bff198403660e Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:59:13 -0700 Subject: [PATCH 25/87] fix(texas): Make import command skip duplicates --- .../commands/import_texas_dockets.py | 53 ++++++++++++------- cl/corpus_importer/management/utils.py | 8 +-- 2 files changed, 39 insertions(+), 22 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index e3f4f37604..3c5eac41a4 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -1,6 +1,6 @@ import json from collections.abc import Iterable -from itertools import batched +from pathlib import Path import botocore.exceptions @@ -43,7 +43,7 @@ def _texas_corpus_download_task( response headers, and docket metadata.""" storage = AWSMediaStorage(bucket_name=docket[0]) logger.info( - "Downloading HTML file from S3: (Bucket: %s; Path: %s)", + "Downloading docket HTML from S3: (Bucket: %s; Path: %s)", docket[0], docket[1], ) @@ -51,12 +51,20 @@ def _texas_corpus_download_task( content = f.read() storage = AWSMediaStorage(bucket_name=docket_headers[0]) - logger.info("Downloading docket headers from S3: %s", docket_headers[1]) + logger.info( + "Downloading docket headers from S3: (Bucket: %s; Path: %s)", + docket_headers[0], + docket_headers[1], + ) with storage.open(docket_headers[1], "r") as f: headers = json.load(f) storage = AWSMediaStorage(bucket_name=docket_meta[0]) - logger.info("Downloading docket meta from S3: %s", docket_meta[1]) + logger.info( + "Downloading docket meta from S3: (Bucket: %s; Path: %s)", + docket_meta[0], + 
docket_meta[1], + ) with storage.open(docket_meta[1], "r") as f: meta = TexasDocketMeta.model_validate_json(f.read()) @@ -68,27 +76,34 @@ class Command(CorpusImporterCommand): compose_redis_key = "texas_docket_import:log" - @staticmethod - def inventory_row_batch_to_download( - batch: tuple[list[str], ...], - ) -> tuple[tuple[str, str], tuple[str, str], tuple[str, str]]: - """Extracts S3 buckets and paths from a batch of three entries from the - Texas inventory file. These will point to: the docket HTML, the docket - response headers, and metadata about the docket.""" - return ( - (batch[0][0].strip(), batch[0][1].strip()), - (batch[1][0].strip(), batch[1][1].strip()), - (batch[2][0].strip(), batch[2][1].strip()), - ) - @staticmethod def transform_inventory_iterator( csv_reader: Iterable[list[str]], ) -> Iterable[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: - return map( - Command.inventory_row_batch_to_download, batched(csv_reader, 3) + html_rows = filter( + lambda r: Path(r[1]).suffix == ".html", + map(lambda r: (r[0].strip(), r[1].strip()), csv_reader), ) + previous_key_stem = None + for html_row in html_rows: + html_bucket, html_key = html_row + html_path = Path(html_key) + docket_name = html_path.stem + if previous_key_stem and docket_name.startswith(previous_key_stem): + continue + else: + previous_key_stem = docket_name + header_key = str( + html_path.with_name(f"{docket_name}_headers.json") + ) + meta_key = str(html_path.with_name(f"{docket_name}_meta.json")) + yield ( + (html_bucket, html_key), + (html_bucket, header_key), + (html_bucket, meta_key), + ) + @staticmethod def download_task() -> app.Task: return _texas_corpus_download_task diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py index d9b95d7612..5621e5064c 100644 --- a/cl/corpus_importer/management/utils.py +++ b/cl/corpus_importer/management/utils.py @@ -204,14 +204,16 @@ def handle(self, *args, **options): ) with open(inventory_path, 
encoding="utf-8") as f: - reader = self.transform_inventory_iterator(csv.reader(f)) + download_inputs = self.transform_inventory_iterator(csv.reader(f)) if options["test_random"]: logger.warning( "In testing mode. Randomly selecting rows from the inventory file." ) - reader = filter(lambda _: random.random() < 0.01, reader) + download_inputs = filter( + lambda _: random.random() < 0.001, download_inputs + ) for row_idx, download_args in islice( - enumerate(reader), start_row, None + enumerate(download_inputs), start_row, None ): throttle.maybe_wait() chain( From acd54bd4e79dcc553f19e320852b407421f33fdf Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Feb 2026 17:16:31 -0700 Subject: [PATCH 26/87] fix(texas): Catch parsing exceptions during import Wrap parsing code in try statement to catch and log parsing exceptions instead of crashing. Do not attempt to parse non-docket entries from the docket inventory file. Stop importing header files as they are not needed. --- .../commands/import_texas_dockets.py | 25 ++---------- cl/corpus_importer/tasks.py | 39 ++++++++++++------- 2 files changed, 28 insertions(+), 36 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index 3c5eac41a4..dc98738db1 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -1,4 +1,3 @@ -import json from collections.abc import Iterable from pathlib import Path @@ -29,14 +28,11 @@ def _texas_corpus_download_task( self: app.Task, docket: tuple[str, str], - docket_headers: tuple[str, str], docket_meta: tuple[str, str], -) -> tuple[bytes, dict[str, str], TexasDocketMeta]: +) -> tuple[bytes, TexasDocketMeta]: """Downloads a scraped file from S3 and returns it for parsing. :param docket: Tuple of S3 bucket name and key where docket HTML is stored. 
- :param docket_headers: Tuple of S3 bucket name and key where docket - response headers are stored. :param docket_meta: Tuple of S3 bucket name and key where docket metadata is stored. :return: Tuple with entries: Bytes of downloaded file, dictionary with @@ -50,15 +46,6 @@ def _texas_corpus_download_task( with storage.open(docket[1], "rb") as f: content = f.read() - storage = AWSMediaStorage(bucket_name=docket_headers[0]) - logger.info( - "Downloading docket headers from S3: (Bucket: %s; Path: %s)", - docket_headers[0], - docket_headers[1], - ) - with storage.open(docket_headers[1], "r") as f: - headers = json.load(f) - storage = AWSMediaStorage(bucket_name=docket_meta[0]) logger.info( "Downloading docket meta from S3: (Bucket: %s; Path: %s)", @@ -68,7 +55,7 @@ def _texas_corpus_download_task( with storage.open(docket_meta[1], "r") as f: meta = TexasDocketMeta.model_validate_json(f.read()) - return content, headers, meta + return content, meta class Command(CorpusImporterCommand): @@ -79,9 +66,9 @@ class Command(CorpusImporterCommand): @staticmethod def transform_inventory_iterator( csv_reader: Iterable[list[str]], - ) -> Iterable[tuple[tuple[str, str], tuple[str, str], tuple[str, str]]]: + ) -> Iterable[tuple[tuple[str, str], tuple[str, str]]]: html_rows = filter( - lambda r: Path(r[1]).suffix == ".html", + lambda r: Path(r[1]).suffix == ".html" and "searches" not in r[1], map(lambda r: (r[0].strip(), r[1].strip()), csv_reader), ) @@ -94,13 +81,9 @@ def transform_inventory_iterator( continue else: previous_key_stem = docket_name - header_key = str( - html_path.with_name(f"{docket_name}_headers.json") - ) meta_key = str(html_path.with_name(f"{docket_name}_meta.json")) yield ( (html_bucket, html_key), - (html_bucket, header_key), (html_bucket, meta_key), ) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 199314af08..38412ad317 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4139,7 +4139,7 @@ def 
merge_texas_docket( ) @time_call(logger) def parse_texas_docket( - self: Task, i: tuple[bytes, dict[str, str], TexasDocketMeta] + self: Task, i: tuple[bytes, TexasDocketMeta] ) -> ( TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4154,19 +4154,28 @@ def parse_texas_docket( - The response headers to the scraper. - Docket metadata. :return: The parsed docket or `None` if parsing failed.""" - content, headers, meta = i - if meta.court_code == "cossup": - parser = TexasSupremeCourtScraper() - elif meta.court_code == "coscca": - parser = TexasCourtOfCriminalAppealsScraper() - elif meta.court_code.startswith("coa"): - parser = TexasCourtOfAppealsScraper(meta.court_code) - else: + content, meta = i + try: + if meta.court_code == "cossup": + parser = TexasSupremeCourtScraper() + elif meta.court_code == "coscca": + parser = TexasCourtOfCriminalAppealsScraper() + elif meta.court_code.startswith("coa"): + parser = TexasCourtOfAppealsScraper(meta.court_code) + else: + logger.error( + "Unrecognized Texas court type %s. Cannot parse.", + meta.court_code, + ) + self.request.chain = None + return None + + parser._parse_text(content.decode("utf-8")) + return parser.data + except Exception as e: + self.request.chain = None logger.error( - "Unrecognized Texas court type %s. 
Cannot parse.", meta.court_code + "Encountered error parsing Texas docket at URL %s: %s", + meta.case_url, + str(e), ) - self.request.chain = None - return None - - parser._parse_text(content.decode("utf-8")) - return parser.data From 7ff1c23e94a5609b641d400480c77985be1074a8 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 09:50:57 -0700 Subject: [PATCH 27/87] fix(texas): Complete TODO tasks Complete TODO tasks for after Juriscraper update that I forgot to complete --- cl/corpus_importer/tasks.py | 12 ++++++------ cl/corpus_importer/tests.py | 11 ++--------- cl/search/state/texas/factories.py | 26 +++++++++++++++----------- 3 files changed, 23 insertions(+), 26 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 38412ad317..67f1930710 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -70,6 +70,7 @@ ) from juriscraper.state.texas.common import ( CourtID, + CourtType, TexasAppellateBrief, TexasCaseDocument, ) @@ -3747,22 +3748,21 @@ def texas_originating_court_to_court_id( :param court_data: The originating court data from Juriscraper. 
:return: The matching Court ID or None if no court could be found.""" - # TODO Replace with JS CourtID enum values when dependency is updated court_type = court_data["court_type"] - if court_type == "texas_appellate": + if court_type == CourtType.APPELLATE.value: return texas_js_court_id_to_court_id(court_data["court_id"]) - if court_type == "texas_district": + if court_type == CourtType.DISTRICT.value: district_number = court_data["district"] if district_number: if district_number > 1: district_number = district_number + 1 return f"texdistct{district_number}" return "texdistct" - if court_type == "texas_business": + if court_type == CourtType.BUSINESS.value: return "texbizct" - if court_type == "texas_municipal": + if court_type == CourtType.MUNICIPAL.value: return "texctyct" - if court_type == "texas_probate": + if court_type == CourtType.PROBATE.value: return "texprobct" # County, justice, and unknown court types return None diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 3b9e97e64a..dec8485b88 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -25,7 +25,7 @@ from juriscraper.state.texas import ( TexasCaseParty, ) -from juriscraper.state.texas.common import CourtID +from juriscraper.state.texas.common import CourtID, CourtType from openai import RateLimitError from pydantic import ValidationError @@ -2859,14 +2859,11 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): texas_district = CourtFactory.create(id="texdistct6") originating_court = TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", district=5, - case="2023-12345", ) docket_data = TexasCourtOfAppealsDocketDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, - date_filed=date(2025, 1, 15), originating_court=originating_court, transfer_from=None, ) @@ -2898,12 +2895,10 @@ def test_merge_texas_case_transfers_appellate_with_workload_transfer( ) transfer_from = 
TexasAppellateTransferDictFactory( court_id=CourtID.SECOND_COURT_OF_APPEALS.value, - date=date(2025, 1, 10), ) docket_data = TexasCourtOfAppealsDocketDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, - date_filed=date(2025, 1, 15), originating_court=originating_court, transfer_from=transfer_from, ) @@ -3030,7 +3025,7 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( def test_merge_texas_case_transfers_no_trial_court_info(self): """Do we handle appellate cases without trial court info?""" originating_court = TexasOriginatingCourtDictFactory( - court_type="texas_unknown", + court_type=CourtType.UNKNOWN.value, case="", ) docket_data = TexasCourtOfAppealsDocketDictFactory( @@ -3059,7 +3054,6 @@ def test_merge_texas_case_transfers_duplicate_handling(self): ) originating_court = TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", district=5, case=transfer.origin_docket_number, ) @@ -3083,7 +3077,6 @@ def test_merge_texas_docket_appellate_sets_appeal_from(self): """Does merge_texas_docket set appeal_from for appellate courts?""" texas_district = CourtFactory.create(id="texdistct6") originating_court = TexasOriginatingDistrictCourtDictFactory( - court_type="texas_district", district=5, ) docket_data = TexasCourtOfAppealsDocketDictFactory( diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index f08a98dc72..84113f4fe7 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -5,7 +5,7 @@ from factory import DictFactory, Faker, List, SubFactory from factory.declarations import LazyAttribute from factory.django import DjangoModelFactory -from juriscraper.state.texas.common import CourtID +from juriscraper.state.texas.common import CourtID, CourtType from cl.search.factories import DocketFactory from cl.search.models import TexasDocketEntry, TexasDocument @@ -37,16 +37,15 @@ class TexasCasePartyDictFactory(DictFactory): 
class TexasOriginatingCourtDictFactory(DictFactory): name = Faker("court_name") - # TODO Replace the literals with values from Juriscraper when the dependency is updated court_type = Faker( "random_element", elements=( - "texas_probate", - "texas_business", - "texas_county", - "texas_municipal", - "texas_justice", - "texas_unknown", + CourtType.PROBATE.value, + CourtType.BUSINESS.value, + CourtType.COUNTY.value, + CourtType.MUNICIPAL.value, + CourtType.JUSTICE.value, + CourtType.UNKNOWN.value, ), ) county = Faker("pystr") @@ -60,17 +59,22 @@ class TexasOriginatingCourtDictFactory(DictFactory): class TexasOriginatingAppellateCourtDictFactory( TexasOriginatingCourtDictFactory ): - court_type = "texas_appellate" + court_type = CourtType.APPELLATE.value court_id = Faker( "random_element", - elements=("texas_coa01", "texas_coa02", "texas_coa14", "texas_coa15"), + elements=( + CourtID.FIRST_COURT_OF_APPEALS.value, + CourtID.SECOND_COURT_OF_APPEALS.value, + CourtID.FOURTEENTH_COURT_OF_APPEALS.value, + CourtID.FIFTEENTH_COURT_OF_APPEALS.value, + ), ) class TexasOriginatingDistrictCourtDictFactory( TexasOriginatingCourtDictFactory ): - court_type = "texas_district" + court_type = CourtType.DISTRICT.value district = Faker("random_element", elements=list(range(1, 527)) + [None]) From ddb757f17e2286c8d244cbd329d5a73b5993a301 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:00:51 -0700 Subject: [PATCH 28/87] fix(texas): Address some review comments Remove unnecessary lock in `merge_texas_docket`; improve docs for `generate_texas_appellate_brief_flags`; move court ID mapping methods to `corpus_importer.utils` --- cl/corpus_importer/tasks.py | 59 +++++------------------------ cl/corpus_importer/utils.py | 75 +++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 74 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 67f1930710..d99120a8db 100644 --- 
a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -61,8 +61,6 @@ TexasCaseParty, TexasCourtOfCriminalAppealsDocket, TexasCourtOfCriminalAppealsScraper, - TexasOriginatingAppellateCourt, - TexasOriginatingDistrictCourt, TexasSupremeCourtAppellateBrief, TexasSupremeCourtCaseEvent, TexasSupremeCourtDocket, @@ -70,7 +68,6 @@ ) from juriscraper.state.texas.common import ( CourtID, - CourtType, TexasAppellateBrief, TexasCaseDocument, ) @@ -120,6 +117,8 @@ is_long_appellate_document_number, make_iquery_probing_key, mark_ia_upload_needed, + texas_js_court_id_to_court_id, + texas_originating_court_to_court_id, ) from cl.custom_filters.templatetags.text_filters import best_case_name from cl.lib.celery_utils import throttle_task @@ -3724,50 +3723,6 @@ def merge_texas_parties( return MergeResult(create=False, update=False, success=True, pk=None) -def texas_js_court_id_to_court_id(js_court_id: str) -> str: - """Translates a Juriscraper Texas court ID to a CourtListener Court ID. - - :param js_court_id: The court ID extracted from Juriscraper. - :return: The corresponding Court ID.""" - if js_court_id == CourtID.SUPREME_COURT.value: - return "tex" - if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: - return "texcrimapp" - # Court of appeals - appellate_number = str(int(js_court_id[len("texas_coa") :])) - if appellate_number == "13": - appellate_number = "13A" - return f"txctapp{appellate_number}" - - -def texas_originating_court_to_court_id( - court_data: TexasOriginatingAppellateCourt | TexasOriginatingDistrictCourt, -) -> str | None: - """Attempts to translate Juriscraper Texas originating court data to a - CourtListener Court ID. - - :param court_data: The originating court data from Juriscraper. 
- :return: The matching Court ID or None if no court could be found.""" - court_type = court_data["court_type"] - if court_type == CourtType.APPELLATE.value: - return texas_js_court_id_to_court_id(court_data["court_id"]) - if court_type == CourtType.DISTRICT.value: - district_number = court_data["district"] - if district_number: - if district_number > 1: - district_number = district_number + 1 - return f"texdistct{district_number}" - return "texdistct" - if court_type == CourtType.BUSINESS.value: - return "texbizct" - if court_type == CourtType.MUNICIPAL.value: - return "texctyct" - if court_type == CourtType.PROBATE.value: - return "texprobct" - # County, justice, and unknown court types - return None - - def merge_texas_docket_originating_court( docket: Docket, docket_data: TexasCourtOfAppealsDocket @@ -3957,6 +3912,13 @@ def generate_texas_appellate_brief_flags( """Generates a list of booleans indicating whether the corresponding entry in the list of TexasCaseEvents is in the list of TexasAppellateBriefs. + The "Appellate Briefs" table in TAMES appears to always be a subset of the + case events table. Therefore, we simply use the case events table to + generate docket entries and set an "appellate_brief" flag to indicate + whether the entry appears in the appellate briefs table. This method + generates those flags given the list of case events and the list of + appellate briefs. + :param case_events: A list of TexasCaseEvent objects. :param appellate_briefs: A list of TexasAppellateBrief objects. 
:return: A list of booleans indicating whether the corresponding entry is @@ -3998,9 +3960,6 @@ def merge_texas_docket( pk=texas_js_court_id_to_court_id(docket_data["court_id"]) ) with transaction.atomic(): - Docket.objects.select_for_update().get( - docket_number=docket_data["docket_number"], court_id=court.pk - ) docket_number = docket_data["docket_number"] try: docket = Docket.objects.get( diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index b42b4a938b..28c3b59ac0 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import itertools import math import random @@ -20,6 +22,11 @@ from eyecite.models import FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer from juriscraper.lib.string_utils import harmonize, titlecase +from juriscraper.state.texas import ( + TexasOriginatingAppellateCourt, + TexasOriginatingDistrictCourt, +) +from juriscraper.state.texas.common import CourtID, CourtType from cl.citations.utils import map_reporter_db_cite_type from cl.lib.command_utils import logger @@ -1293,33 +1300,53 @@ def create_docket_entry_sequence_numbers( return sequence_numbers -def juriscraper_to_cl_court_id(js_court_id: str) -> str | None: - """Converts a court ID from Juriscraper to the court ID used in the - database. Utility function for a lot of if statements basically. - - :param js_court_id: The court ID from Juriscraper. 
- :return: The ID of this court in the database or `None` if the Juriscraper - ID was not recognized.""" - if js_court_id.startswith("texas_"): - js_texas_court_id = js_court_id[len("texas_") :] - - if js_texas_court_id.startswith("coa"): - coa_number = int(js_texas_court_id[len("coa") :]) - # TODO 13A and B (for some reason) - return f"txctapp{coa_number}" - if js_texas_court_id == "coscca": - return "texcrimapp" - if js_texas_court_id == "cossup": - return "tex" - logger.error("Unrecognized Texas court ID: %s", js_court_id) - return None - logger.error("Unrecognized court ID: %s", js_court_id) - return None - - @dataclass class DownloadPDFResult: """Result of a PDF download operation.""" success: bool sha1: str | None = None + + +def texas_js_court_id_to_court_id(js_court_id: str) -> str: + """Translates a Juriscraper Texas court ID to a CourtListener Court ID. + + :param js_court_id: The court ID extracted from Juriscraper. + :return: The corresponding Court ID.""" + if js_court_id == CourtID.SUPREME_COURT.value: + return "tex" + if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: + return "texcrimapp" + # Court of appeals + appellate_number = str(int(js_court_id[len("texas_coa") :])) + if appellate_number == "13": + appellate_number = "13A" + return f"txctapp{appellate_number}" + + +def texas_originating_court_to_court_id( + court_data: TexasOriginatingAppellateCourt | TexasOriginatingDistrictCourt, +) -> str | None: + """Attempts to translate Juriscraper Texas originating court data to a + CourtListener Court ID. + + :param court_data: The originating court data from Juriscraper. 
+ :return: The matching Court ID or None if no court could be found.""" + court_type = court_data["court_type"] + if court_type == CourtType.APPELLATE.value: + return texas_js_court_id_to_court_id(court_data["court_id"]) + if court_type == CourtType.DISTRICT.value: + district_number = court_data["district"] + if district_number: + if district_number > 1: + district_number = district_number + 1 + return f"texdistct{district_number}" + return "texdistct" + if court_type == CourtType.BUSINESS.value: + return "texbizct" + if court_type == CourtType.MUNICIPAL.value: + return "texctyct" + if court_type == CourtType.PROBATE.value: + return "texprobct" + # County, justice, and unknown court types + return None From dd8a181946dd2ed1445dae660425f01f97ee4ac4 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:07:01 -0700 Subject: [PATCH 29/87] fix(texas): Broken assert in test Fix assert that was checking for a constant value breaking after making that value fuzzy. 
--- cl/corpus_importer/tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index dec8485b88..7b02e2bf6e 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2881,7 +2881,7 @@ def test_merge_texas_case_transfers_appellate_court_from_trial(self): assert transfer.origin_court == texas_district assert transfer.origin_docket_number == originating_court["case"] assert transfer.transfer_type == CaseTransfer.APPEAL - assert transfer.transfer_date == date(2025, 1, 15) + assert transfer.transfer_date == docket_data["date_filed"] def test_merge_texas_case_transfers_appellate_with_workload_transfer( self, From 0dfb9b81c4a15e990cf53ab56b7ec7474287e485 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:35:25 -0700 Subject: [PATCH 30/87] feat(texas): Set originating court judges --- cl/corpus_importer/tasks.py | 26 +++++++++++++++++++++++--- cl/people_db/lookup_utils.py | 9 +++++++-- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index d99120a8db..80bf18a6bd 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -150,6 +150,9 @@ ) from cl.lib.redis_utils import delete_redis_semaphore, get_redis_interface from cl.lib.types import TaskData +from cl.people_db.lookup_utils import ( + lookup_judge_by_full_name_and_set_attr, +) from cl.people_db.models import Attorney, Role from cl.recap.constants import CR_2017, CR_OLD, CV_2017, CV_2020, CV_OLD from cl.recap.mergers import ( @@ -3751,7 +3754,19 @@ def merge_texas_docket_originating_court( originating_court_information.assigned_to_str = originating_court_data[ "judge" ] - # TODO Get judge from PeopleDB to add + originating_court_id = texas_originating_court_to_court_id( + originating_court_data + ) + # Only update judge if we're able to associate them with a court. 
+ if originating_court_id: + async_to_sync(lookup_judge_by_full_name_and_set_attr)( + item=originating_court_information, + target_field="assigned_to", + full_name=originating_court_data["judge"], + court_id=originating_court_id, + event_date=None, + require_living_judge=False, + ) originating_court_information.save() if created: docket.save() @@ -3962,8 +3977,13 @@ def merge_texas_docket( with transaction.atomic(): docket_number = docket_data["docket_number"] try: - docket = Docket.objects.get( - court_id=court.pk, docket_number=docket_number + docket = find_docket_object( + court_id=court.pk, + pacer_case_id=None, + docket_number=docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, ) except Docket.DoesNotExist: logger.info( diff --git a/cl/people_db/lookup_utils.py b/cl/people_db/lookup_utils.py index a376c8c36f..62c8c74d11 100644 --- a/cl/people_db/lookup_utils.py +++ b/cl/people_db/lookup_utils.py @@ -555,7 +555,8 @@ async def lookup_judge_by_full_name_and_set_attr( target_field: str, full_name: HumanName | str, court_id: str, - event_date: date, + event_date: date | None, + require_living_judge: bool = True, ) -> None: """Lookup a judge by the attribute of an object @@ -564,11 +565,15 @@ async def lookup_judge_by_full_name_and_set_attr( :param full_name: The full name of the judge to look up :param court_id: The court where the judge did something :param event_date: The date the judge did something + :param require_living_judge: Whether to ensure that the judge found was + born before the event date and died after it. 
:return None """ if not full_name: return None - judge = await lookup_judge_by_full_name(full_name, court_id, event_date) + judge = await lookup_judge_by_full_name( + full_name, court_id, event_date, require_living_judge + ) setattr(item, target_field, judge) From ebeed203989238cf2318a1f740356a542df36b64 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:46:24 -0700 Subject: [PATCH 31/87] fix(texas): Missing return lint --- cl/corpus_importer/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 80bf18a6bd..8b230fe92a 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4158,3 +4158,4 @@ def parse_texas_docket( meta.case_url, str(e), ) + return None From 3fa3963d9cadc65926448e9a72fa11f80c58233a Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 11:46:11 -0700 Subject: [PATCH 32/87] fix(texas): Wrap async method in `async_to_sync` --- cl/corpus_importer/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 8b230fe92a..3274a98c36 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3977,7 +3977,7 @@ def merge_texas_docket( with transaction.atomic(): docket_number = docket_data["docket_number"] try: - docket = find_docket_object( + docket = async_to_sync(find_docket_object)( court_id=court.pk, pacer_case_id=None, docket_number=docket_number, From f3235ca638ce45bc77b1405170c94fbf5336c1ab Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 13:32:31 -0700 Subject: [PATCH 33/87] feat(texas): Add logic to disaggregate dockets Add logic to disaggregate Texas appellate docket courts, which are currently all stored in the `texapp` court. 
--- cl/corpus_importer/tasks.py | 52 +++++++++++++----------------- cl/recap/mergers.py | 19 ++++++++--- cl/search/state/texas/factories.py | 6 ++-- 3 files changed, 39 insertions(+), 38 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 3274a98c36..dbea79bda2 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -68,6 +68,7 @@ ) from juriscraper.state.texas.common import ( CourtID, + CourtType, TexasAppellateBrief, TexasCaseDocument, ) @@ -3976,41 +3977,34 @@ def merge_texas_docket( ) with transaction.atomic(): docket_number = docket_data["docket_number"] - try: + docket = None + if docket_data["court_type"] == CourtType.APPELLATE: docket = async_to_sync(find_docket_object)( - court_id=court.pk, + court_id="texapp", pacer_case_id=None, docket_number=docket_number, federal_defendant_number=None, federal_dn_judge_initials_assigned=None, federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=False, ) - except Docket.DoesNotExist: - logger.info( - "Could not find docket %s in court %s. Creating new Docket.", - docket_number, - court.pk, - ) - docket_created = True - docket = Docket( + if docket is not None: + logger.info( + "Disaggregating Texas appellate docket %s", docket_number + ) + docket.court = court + if docket is None: + docket = async_to_sync(find_docket_object)( court_id=court.pk, + pacer_case_id=None, docket_number=docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=True, ) - except Docket.MultipleObjectsReturned: - logger.error( - "Multiple dockets found for court %s with docket number %s. This likely indicates an error in the merging code.", - court.pk, - docket_number, - ) - return MergeResult.failed() - else: - logger.info( - "Found existing Docket with docket number %s in court %s. 
Acquiring DB lock for update.", - docket_number, - court.pk, - ) - docket_created = False - Docket.objects.select_for_update().get(pk=docket.pk) docket.date_filed = docket_data["date_filed"] docket.cause = docket_data["case_type"] originating_court_merge_result = merge_texas_docket_originating_court( @@ -4023,7 +4017,7 @@ def merge_texas_docket( court.pk, ) - if docket_data["court_type"] == "texas_appellate": + if docket_data["court_type"] == CourtType.APPELLATE.value: lower_court_data = docket_data["originating_court"] lower_court_id = texas_originating_court_to_court_id( lower_court_data @@ -4083,15 +4077,13 @@ def merge_texas_docket( ) create = ( - docket_created - or party_merge_result.create + party_merge_result.create or originating_court_merge_result.create or merge_case_transfer_result.create or any(r.create for r in entry_merge_results) ) update = ( - not docket_created - or party_merge_result.update + party_merge_result.update or originating_court_merge_result.update or merge_case_transfer_result.update or any(r.update for r in entry_merge_results) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 649fe7e9a1..65593c2a00 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -134,7 +134,9 @@ async def find_docket_object( federal_dn_judge_initials_assigned: str | None, federal_dn_judge_initials_referred: str | None, using: str = "default", -) -> Docket: + docket_source: int = Docket.RECAP, + allow_create: bool = True, +) -> Docket | None: """Attempt to find the docket based on the parsed docket data. If cannot be found, create a new docket. If multiple are found, return the oldest. @@ -148,6 +150,9 @@ async def find_docket_object( :param federal_dn_judge_initials_referred: The judge's initials referred to validate the match. :param using: The database to use for the lookup queries. + :param docket_source: The source to set when creating a new docket. 
+ :param allow_create: Whether to create a new docket if no matching one is + found :return The docket found or created. """ # Attempt several lookups of decreasing specificity. Note that @@ -251,10 +256,14 @@ async def find_docket_object( break if d is None: # Couldn't find a docket. Return a new one. - return Docket( - source=Docket.RECAP, - pacer_case_id=pacer_case_id, - court_id=court_id, + return ( + Docket( + source=docket_source, + pacer_case_id=pacer_case_id, + court_id=court_id, + ) + if allow_create + else None ) if using != "default": diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 84113f4fe7..3f2c923800 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -90,7 +90,7 @@ class TexasCommonDataDictFactory(DictFactory): ) court_type = Faker( "random_element", - elements=("texas_appellate", "texas_final"), + elements=(CourtType.APPELLATE.value, CourtType.SUPREME.value), ) # Not correct, but close enough docket_number = Faker("federal_district_docket_number") @@ -176,7 +176,7 @@ class TexasAppellateTransferDictFactory(DictFactory): class TexasCourtOfAppealsDocketDictFactory(TexasCommonDataDictFactory): """Factory for Texas Court of Appeals docket data.""" - court_type = "texas_appellate" + court_type = CourtType.APPELLATE.value court_id = Faker( "random_element", elements=( @@ -200,7 +200,7 @@ class TexasCourtOfAppealsDocketDictFactory(TexasCommonDataDictFactory): class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): """Factory for Texas Supreme Court and Court of Criminal Appeals docket data.""" - court_type = "texas_final" + court_type = CourtType.SUPREME.value appeals_court = SubFactory(TexasAppellateCourtInfoDictFactory) court_id = Faker( "random_element", From b5a50134cf22beadd644885a4af21fcbc47f1163 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 17:06:01 -0700 Subject: [PATCH 34/87] 
feat(texas): Rate-limit PDF downloads --- cl/corpus_importer/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index dbea79bda2..ad0667d200 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3398,6 +3398,7 @@ def download_qp_scotus_pdf(self, docket_id: int) -> None: ignore_result=True, # No retries because download_pdf_in_stream already has retry logic ) +@throttle_task("2/s") def download_texas_document_pdf( self: Task, texas_document_pk: int ) -> int | None: From 02636729c58f52befdc19cfa3ccd36666bc17ef7 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Feb 2026 17:50:48 -0700 Subject: [PATCH 35/87] docs(texas): More documentation for case transfer merger --- cl/corpus_importer/tasks.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index ad0667d200..045572b56c 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3782,7 +3782,16 @@ def merge_texas_case_transfers( | TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, ) -> MergeResult: - """Merge appeal and work sharing information into the given Texas docket. + """This method creates or updates up any `CaseTransfer` objects which point to + or originate from a given docket to capture appeal and work sharing + information. + + If a `CaseTransfer` exists with the same origin and destination docket + numbers and court fields as one we want to create, update the origin or + destination docket foreign key field to point to this docket. This allows + us to merge `CaseTransfer` objects for which we only have partial + information and complete the information at a later time (or never if the + origin/destination is a court we don't scrape). :param docket: The docket to add the appeal information to. 
:param docket_data: The docket data from Juriscraper. @@ -3791,7 +3800,7 @@ def merge_texas_case_transfers( docket_data["originating_court"] ) - if docket_data["court_type"] == "texas_final": + if docket_data["court_type"] == CourtType.SUPREME.value: # Assume that the originating court -> appellate court transfer will # be populated by an appellate docket later on. transfer = CaseTransfer( @@ -3849,7 +3858,7 @@ def merge_texas_case_transfers( ) return MergeResult.failed() transfers = [transfer] - elif docket_data["court_type"] == "texas_appellate": + elif docket_data["court_type"] == CourtType.APPELLATE.value: transfers = [] if trial_court_id: transfers.append( @@ -3893,6 +3902,7 @@ def merge_texas_case_transfers( any_created = False for transfer in transfers: + # TODO Update once fk's are back to match docstring _, created = CaseTransfer.objects.get_or_create( origin_court=transfer.origin_court, origin_docket_number=transfer.origin_docket_number, From 76cc5fc7732c8026c2052d4f5481d26e1b37f0c8 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 18 Feb 2026 17:47:04 -0700 Subject: [PATCH 36/87] feat(texas): Generalized CaseTransfer merger --- cl/corpus_importer/tasks.py | 101 +++++++++++++++++++++++++++++------- 1 file changed, 83 insertions(+), 18 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 045572b56c..7ea19214ce 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3498,6 +3498,85 @@ def unnecessary[S](pk: S) -> MergeResult[S]: return MergeResult(create=False, update=False, success=True, pk=pk) +def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: + """Merges a CaseTransfer object in the database by first checking if it can + be used to update an existing object, and if not, creating a new object (if + necessary). + + :param case_transfer: The CaseTransfer object to be merged. 
+ :return: The result of this merge attempt.""" + logger.info( + "Merging CaseTransfer from docket %s in court %s to docket %s in court %s on %s with type %s.", + case_transfer.origin_court.pk, + case_transfer.origin_docket_number, + case_transfer.destination_court.pk, + case_transfer.destination_docket_number, + case_transfer.transfer_date.isoformat(), + case_transfer.transfer_type, + ) + candidate_case_transfers = CaseTransfer.objects.filter( + origin_court=case_transfer.origin_court, + origin_docket_number=case_transfer.origin_docket_number, + destination_court=case_transfer.destination_court, + destination_docket_number=case_transfer.destination_docket_number, + transfer_date=case_transfer.transfer_date, + transfer_type=case_transfer.transfer_type, + ) + try: + # Try to find an existing CaseTransfer to fill in info for. + if case_transfer.origin_docket: + existing_case_transfer = candidate_case_transfers.get( + origin_docket=None + ) + else: + existing_case_transfer = candidate_case_transfers.get( + destination_docket=None + ) + except CaseTransfer.MultipleObjectsReturned: + # This should never happen + logger.error( + "Found multiple matching CaseTransfer objects.", + ) + return MergeResult.failed() + except CaseTransfer.DoesNotExist: + logger.info( + "Could not find existing transfer to update. Checking if transfer already exists..." + ) + try: + existing_case_transfer = candidate_case_transfers.get( + origin_docket=case_transfer.origin_docket, + destination_docket=case_transfer.destination_docket, + ) + except CaseTransfer.MultipleObjectsReturned: + # Should never happen + logger.error( + "Found multiple matching CaseTransfer objects.", + ) + return MergeResult.failed() + except CaseTransfer.DoesNotExist: + logger.info( + "Did not find existing CaseTransfer object. Creating..." + ) + case_transfer.save() + return MergeResult.created(case_transfer.pk) + logger.info( + "Identical CaseTransfer object already exists. Merge is unnecessary." 
+ ) + return MergeResult.unnecessary(existing_case_transfer.pk) + else: + logger.info( + "Updating existing CaseTransfer %s.", existing_case_transfer.pk + ) + if case_transfer.origin_docket: + existing_case_transfer.origin_docket = case_transfer.origin_docket + else: + existing_case_transfer.destination_docket = ( + case_transfer.destination_docket + ) + existing_case_transfer.save() + return MergeResult.updated(existing_case_transfer.pk) + + def merge_texas_document( docket_entry: TexasDocketEntry, input_document: TexasCaseDocument ) -> MergeResult: @@ -3800,6 +3879,7 @@ def merge_texas_case_transfers( docket_data["originating_court"] ) + # TODO Also need to generate reverse transfers to ensure information is complete if docket_data["court_type"] == CourtType.SUPREME.value: # Assume that the originating court -> appellate court transfer will # be populated by an appellate docket later on. @@ -3902,8 +3982,7 @@ def merge_texas_case_transfers( any_created = False for transfer in transfers: - # TODO Update once fk's are back to match docstring - _, created = CaseTransfer.objects.get_or_create( + case_transfer = CaseTransfer( origin_court=transfer.origin_court, origin_docket_number=transfer.origin_docket_number, destination_court=transfer.destination_court, @@ -3911,23 +3990,9 @@ def merge_texas_case_transfers( transfer_date=transfer.transfer_date, transfer_type=transfer.transfer_type, ) - if created: + merge_result = merge_case_transfer(case_transfer) + if merge_result.create: any_created = True - logger.info( - "Created CaseTransfer object from docket %s in court %s to docket %s in court %s", - transfer.origin_docket_number, - transfer.origin_court.pk, - transfer.destination_docket_number, - transfer.destination_court.pk, - ) - else: - logger.warning( - "CaseTransfer object from docket %s in court %s to docket %s in court %s already exists", - transfer.origin_docket_number, - transfer.origin_court.pk, - transfer.destination_docket_number, - 
transfer.destination_court.pk, - ) return MergeResult(success=True, create=any_created, update=False, pk=None) From e125035881352cec0ba71199e0182cb50ec249e3 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 19 Feb 2026 09:45:03 -0700 Subject: [PATCH 37/87] fix(texas): Remove duplicate migrations Remove unnecessary 0053 migrations now that model changes are in main. --- .../0053_case_transfer_docket_to_text_id.py | 107 ------------------ .../0053_case_transfer_docket_to_text_id.sql | 41 ------- ...e_transfer_docket_to_text_id_customers.sql | 21 ---- 3 files changed, 169 deletions(-) delete mode 100644 cl/search/migrations/0053_case_transfer_docket_to_text_id.py delete mode 100644 cl/search/migrations/0053_case_transfer_docket_to_text_id.sql delete mode 100644 cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql diff --git a/cl/search/migrations/0053_case_transfer_docket_to_text_id.py b/cl/search/migrations/0053_case_transfer_docket_to_text_id.py deleted file mode 100644 index 55b8db455d..0000000000 --- a/cl/search/migrations/0053_case_transfer_docket_to_text_id.py +++ /dev/null @@ -1,107 +0,0 @@ -# Generated by Django 6.0.1 on 2026-02-10 17:22 - -import pgtrigger.compiler -import pgtrigger.migrations -from django.db import migrations, models - - -class Migration(migrations.Migration): - dependencies = [ - ("search", "0052_add_scotus_docket_entry_and_scotus_documents"), - ] - - operations = [ - pgtrigger.migrations.RemoveTrigger( - model_name="casetransfer", - name="update_update", - ), - pgtrigger.migrations.RemoveTrigger( - model_name="casetransfer", - name="delete_delete", - ), - migrations.RemoveField( - model_name="casetransfer", - name="destination_docket", - ), - migrations.RemoveField( - model_name="casetransfer", - name="origin_docket", - ), - migrations.RemoveField( - model_name="casetransferevent", - name="destination_docket", - ), - migrations.RemoveField( - 
model_name="casetransferevent", - name="origin_docket", - ), - migrations.AddField( - model_name="casetransfer", - name="destination_docket_number", - field=models.TextField( - db_comment="The ID of the case docket in the destination court.", - default="", - help_text="The ID of the case docket in the destination court.", - ), - preserve_default=False, - ), - migrations.AddField( - model_name="casetransfer", - name="origin_docket_number", - field=models.TextField( - db_comment="The ID of the docket this transfer originates from.", - default="", - help_text="The ID of the docket this transfer originates from.", - ), - preserve_default=False, - ), - migrations.AddField( - model_name="casetransferevent", - name="destination_docket_number", - field=models.TextField( - db_comment="The ID of the case docket in the destination court.", - default="", - help_text="The ID of the case docket in the destination court.", - ), - preserve_default=False, - ), - migrations.AddField( - model_name="casetransferevent", - name="origin_docket_number", - field=models.TextField( - db_comment="The ID of the docket this transfer originates from.", - default="", - help_text="The ID of the docket this transfer originates from.", - ), - preserve_default=False, - ), - pgtrigger.migrations.AddTrigger( - model_name="casetransfer", - trigger=pgtrigger.compiler.Trigger( - name="update_update", - sql=pgtrigger.compiler.UpsertTriggerSql( - condition='WHEN (OLD."destination_court_id" IS DISTINCT FROM (NEW."destination_court_id") OR OLD."destination_docket_number" IS DISTINCT FROM (NEW."destination_docket_number") OR OLD."id" IS DISTINCT FROM (NEW."id") OR OLD."origin_court_id" IS DISTINCT FROM (NEW."origin_court_id") OR OLD."origin_docket_number" IS DISTINCT FROM (NEW."origin_docket_number") OR OLD."transfer_date" IS DISTINCT FROM (NEW."transfer_date") OR OLD."transfer_type" IS DISTINCT FROM (NEW."transfer_type"))', - func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", 
"destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'update\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', - hash="80b917540054665b8a49836933ea803aad526606", - operation="UPDATE", - pgid="pgtrigger_update_update_8e8e1", - table="search_casetransfer", - when="AFTER", - ), - ), - ), - pgtrigger.migrations.AddTrigger( - model_name="casetransfer", - trigger=pgtrigger.compiler.Trigger( - name="delete_delete", - sql=pgtrigger.compiler.UpsertTriggerSql( - func='INSERT INTO "search_casetransferevent" ("date_created", "date_modified", "destination_court_id", "destination_docket_number", "id", "origin_court_id", "origin_docket_number", "pgh_context_id", "pgh_created_at", "pgh_label", "pgh_obj_id", "transfer_date", "transfer_type") VALUES (OLD."date_created", OLD."date_modified", OLD."destination_court_id", OLD."destination_docket_number", OLD."id", OLD."origin_court_id", OLD."origin_docket_number", _pgh_attach_context(), NOW(), \'delete\', OLD."id", OLD."transfer_date", OLD."transfer_type"); RETURN NULL;', - hash="1ebca4ba9fa46ebba2ad0edfdedbc2710d4cb1b6", - operation="DELETE", - pgid="pgtrigger_delete_delete_b8bc0", - table="search_casetransfer", - when="AFTER", - ), - ), - ), - ] diff --git a/cl/search/migrations/0053_case_transfer_docket_to_text_id.sql b/cl/search/migrations/0053_case_transfer_docket_to_text_id.sql deleted file mode 100644 index ca82dce37f..0000000000 --- a/cl/search/migrations/0053_case_transfer_docket_to_text_id.sql +++ /dev/null @@ -1,41 +0,0 @@ -BEGIN; -SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP 
CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; -ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; --- --- Remove field origin_docket from casetransfer --- -SET CONSTRAINTS "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do"; -ALTER TABLE "search_casetransfer" DROP COLUMN "origin_docket_id"; --- --- Remove field destination_docket from casetransferevent --- -ALTER TABLE "search_casetransferevent" DROP COLUMN "destination_docket_id"; --- --- Remove field origin_docket from casetransferevent --- -ALTER TABLE "search_casetransferevent" DROP COLUMN "origin_docket_id"; --- --- Add field destination_docket_number to casetransfer --- -ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; --- --- Add field origin_docket_number to casetransfer --- -ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; --- --- Add field destination_docket_number to casetransferevent --- -ALTER TABLE "search_casetransferevent" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransferevent" ALTER COLUMN "destination_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransferevent"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; --- --- Add field origin_docket_number to casetransferevent --- -ALTER TABLE "search_casetransferevent" 
ADD COLUMN "origin_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransferevent" ALTER COLUMN "origin_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransferevent"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; -COMMIT; diff --git a/cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql b/cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql deleted file mode 100644 index 7877aa77da..0000000000 --- a/cl/search/migrations/0053_case_transfer_docket_to_text_id_customers.sql +++ /dev/null @@ -1,21 +0,0 @@ -BEGIN; -SET CONSTRAINTS "search_casetransfer_destination_docket_i_9941948f_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_destination_docket_i_9941948f_fk_search_do"; -ALTER TABLE "search_casetransfer" DROP COLUMN "destination_docket_id"; --- --- Remove field origin_docket from casetransfer --- -SET CONSTRAINTS "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do" IMMEDIATE; ALTER TABLE "search_casetransfer" DROP CONSTRAINT "search_casetransfer_origin_docket_id_b23a08e9_fk_search_do"; -ALTER TABLE "search_casetransfer" DROP COLUMN "origin_docket_id"; --- --- Add field destination_docket_number to casetransfer --- -ALTER TABLE "search_casetransfer" ADD COLUMN "destination_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "destination_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."destination_docket_number" IS 'The ID of the case docket in the destination court.'; --- --- Add field origin_docket_number to casetransfer --- -ALTER TABLE "search_casetransfer" ADD COLUMN "origin_docket_number" text DEFAULT '' NOT NULL; -ALTER TABLE "search_casetransfer" ALTER COLUMN "origin_docket_number" DROP DEFAULT; -COMMENT ON COLUMN "search_casetransfer"."origin_docket_number" IS 'The ID of the docket this transfer originates from.'; -COMMIT; From 
6cb63eb74ff84ca79af31c9f8a1a9547dabee7b7 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 19 Feb 2026 13:45:09 -0700 Subject: [PATCH 38/87] fix(texas): Test failures Address test failures that were happening because I forgot to update them after changing the code. --- cl/corpus_importer/tasks.py | 5 +++++ cl/corpus_importer/tests.py | 32 +++++++++++++++++++++----------- cl/search/factories.py | 12 ++++++++++-- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 7ea19214ce..695b7d0117 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3886,6 +3886,7 @@ def merge_texas_case_transfers( transfer = CaseTransfer( destination_court=docket.court, destination_docket_number=docket.docket_number, + destination_docket=docket, transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.APPEAL, ) @@ -3949,6 +3950,7 @@ def merge_texas_case_transfers( ], destination_court=docket.court, destination_docket_number=docket.docket_number, + destination_docket=docket, transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.APPEAL, ) @@ -3966,6 +3968,7 @@ def merge_texas_case_transfers( ], destination_court=docket.court, destination_docket_number=docket.docket_number, + destination_docket=docket, # The "date" field of transfers is not always set, but when it is, it seems to match date filed. 
transfer_date=docket_data["date_filed"], transfer_type=CaseTransfer.WORKLOAD, @@ -3985,8 +3988,10 @@ def merge_texas_case_transfers( case_transfer = CaseTransfer( origin_court=transfer.origin_court, origin_docket_number=transfer.origin_docket_number, + origin_docket=transfer.origin_docket, destination_court=transfer.destination_court, destination_docket_number=transfer.destination_docket_number, + destination_docket=transfer.destination_docket, transfer_date=transfer.transfer_date, transfer_type=transfer.transfer_type, ) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 4d41c4b4f6..9b7e043bc9 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2278,11 +2278,16 @@ def test_merge_texas_document_existing_document_update(self): self.download_task_mock.assert_called_once_with(current_document.pk) + @mock.patch("cl.lib.celery_utils.get_task_wait", return_value=0) + @mock.patch("cl.corpus_importer.tasks.doc_page_count_service") @responses.activate - def test_merge_texas_document_plaintext_extraction(self): + def test_merge_texas_document_plaintext_extraction( + self, pcs_mock, throttle_mock + ): """ Ensure plaintext extraction is triggered by `merge_texas_document`. 
""" + pcs_mock.return_value = httpx.Response(200, text="1") # Stop the mocks just for this test self.download_task_patch.stop() self.extract_pdf_document_patch.stop() @@ -2723,9 +2728,12 @@ def test_normalize_empty_parties_list(self): result = normalize_texas_parties([]) assert result == [] + @mock.patch("cl.lib.celery_utils.get_task_wait", return_value=0) @mock.patch("cl.corpus_importer.tasks.doc_page_count_service") @responses.activate - def test_download_texas_document_pdf_success(self, pcs_mock): + def test_download_texas_document_pdf_success( + self, pcs_mock, throttle_mock + ): """Can we successfully download a PDF for a TexasDocument?""" self.download_pdf_patch.stop() texas_document = TexasDocumentFactory.create() @@ -3039,25 +3047,27 @@ def test_merge_texas_case_transfers_duplicate_handling(self): """Do we properly handle duplicate CaseTransfer objects?""" texas_district = CourtFactory.create(id="texdistct6") - transfer = CaseTransferFactory.create( - origin_court=texas_district, - destination_court=self.texas_coa1, - destination_docket_number=self.docket_number_coa1, - transfer_type=CaseTransfer.APPEAL, - ) - originating_court = TexasOriginatingDistrictCourtDictFactory( district=5, - case=transfer.origin_docket_number, ) docket_data = TexasCourtOfAppealsDocketDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, docket_number=self.docket_number_coa1, originating_court=originating_court, - date_filed=transfer.transfer_date, transfer_from=None, ) + CaseTransferFactory.create( + origin_court=texas_district, + origin_docket=None, + origin_docket_number=originating_court["case"], + destination_court=self.texas_coa1, + destination_docket_number=self.docket_number_coa1, + destination_docket=self.docket_coa1, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, + ) + result = merge_texas_case_transfers(self.docket_coa1, docket_data) assert result.success is True diff --git a/cl/search/factories.py b/cl/search/factories.py index 
57dbff2e5d..a1961f934f 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -433,9 +433,17 @@ class ScotusDocketDataFactory(DictFactory): class CaseTransferFactory(DjangoModelFactory): origin_court = SubFactory(CourtFactory) - origin_docket_number = Faker("federal_district_docket_number") + origin_docket_number = LazyAttribute( + lambda ct: ct.origin_docket.docket_number if ct.origin_docket else None + ) + origin_docket = SubFactory(DocketFactory) destination_court = SubFactory(CourtFactory) - destination_docket_number = Faker("federal_district_docket_number") + destination_docket_number = LazyAttribute( + lambda ct: ct.destination_docket.docket_number + if ct.destination_docket + else None + ) + destination_docket = SubFactory(DocketFactory) transfer_date = Faker("date_object") transfer_type = Faker( "random_element", From 649356845c440a86a7e1e2a576b798ed6c8d060b Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 19 Feb 2026 16:02:38 -0700 Subject: [PATCH 39/87] chore(texas): More logging and transfer date fix --- cl/corpus_importer/tasks.py | 48 ++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 695b7d0117..26eed58f78 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3879,7 +3879,6 @@ def merge_texas_case_transfers( docket_data["originating_court"] ) - # TODO Also need to generate reverse transfers to ensure information is complete if docket_data["court_type"] == CourtType.SUPREME.value: # Assume that the originating court -> appellate court transfer will # be populated by an appellate docket later on. 
@@ -3916,6 +3915,10 @@ def merge_texas_case_transfers( return MergeResult.failed() else: if appeals_court["court_id"] == CourtID.UNKNOWN: + logger.warning( + "Found appellate court with unknown ID (docket %s)", + docket.docket_number, + ) appeals_court_id = "texapp" else: appeals_court_id = texas_js_court_id_to_court_id( @@ -3925,6 +3928,10 @@ def merge_texas_case_transfers( transfer.origin_docket_number = appeals_court["case_number"] elif docket_data["court_id"] == CourtID.SUPREME_COURT.value: if appeals_court["court_id"] == CourtID.UNKNOWN: + logger.warning( + "Found appellate court with unknown ID (docket %s)", + docket.docket_number, + ) appeals_court_id = "texapp" else: appeals_court_id = texas_js_court_id_to_court_id( @@ -3956,6 +3963,12 @@ def merge_texas_case_transfers( ) ) if docket_data["transfer_from"]: + transfer_from_date = docket_data["transfer_from"]["date"] + if not transfer_from_date: + logger.warning( + "Missing transfer date for workload transfer of docket %s", + docket.docket_number, + ) transfers.append( CaseTransfer( origin_court=Court.objects.get( @@ -3969,13 +3982,13 @@ def merge_texas_case_transfers( destination_court=docket.court, destination_docket_number=docket.docket_number, destination_docket=docket, - # The "date" field of transfers is not always set, but when it is, it seems to match date filed. - transfer_date=docket_data["date_filed"], + # If the transfer date is absent or empty, assume it matches the filing date + transfer_date=transfer_from_date + if transfer_from_date + else docket_data["date_filed"], transfer_type=CaseTransfer.WORKLOAD, ) ) - # Assume that the value in the "transfer_to" field will be filled in - # by another court. 
else: logger.error( "Unrecognized Texas court type %s while creating CaseTransfer", @@ -4056,10 +4069,14 @@ def merge_texas_docket( court = Court.objects.get( pk=texas_js_court_id_to_court_id(docket_data["court_id"]) ) + docket_number = docket_data["docket_number"] + logger.info("Merging Texas docket %s", docket_number) with transaction.atomic(): - docket_number = docket_data["docket_number"] docket = None - if docket_data["court_type"] == CourtType.APPELLATE: + if docket_data["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Docket is appellate. Checking if disaggregation is necessary..." + ) docket = async_to_sync(find_docket_object)( court_id="texapp", pacer_case_id=None, @@ -4075,7 +4092,7 @@ def merge_texas_docket( "Disaggregating Texas appellate docket %s", docket_number ) docket.court = court - if docket is None: + else: docket = async_to_sync(find_docket_object)( court_id=court.pk, pacer_case_id=None, @@ -4122,6 +4139,11 @@ def merge_texas_docket( ) # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts court_name = lower_court_data["name"] + logger.info( + "Updating lower court info with court %s (ID %s).", + court_name, + lower_court_id, + ) docket.appeal_from_str = court_name docket.save() @@ -4175,6 +4197,12 @@ def merge_texas_docket( and merge_case_transfer_result.success and all(r.success for r in entry_merge_results) ) + if not success: + logger.error( + "One or more steps in Texas case merging failed for docket %s (pk %s). Please review logs.", + docket_number, + docket.pk, + ) return MergeResult( create=create, @@ -4207,6 +4235,7 @@ def parse_texas_docket( - Docket metadata. 
:return: The parsed docket or `None` if parsing failed.""" content, meta = i + logger.info("Attempting to parse Texas docket %s...", meta.case_number) try: if meta.court_code == "cossup": parser = TexasSupremeCourtScraper() @@ -4216,8 +4245,9 @@ def parse_texas_docket( parser = TexasCourtOfAppealsScraper(meta.court_code) else: logger.error( - "Unrecognized Texas court type %s. Cannot parse.", + "Unrecognized Texas court type %s. Cannot parse docket %s.", meta.court_code, + meta.case_number, ) self.request.chain = None return None From d4f41bcc12b917079cbf796a3ca959a99443c06a Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 23 Feb 2026 13:09:28 -0700 Subject: [PATCH 40/87] fix(texas): Address review comments --- .../commands/import_texas_dockets.py | 61 +------ cl/corpus_importer/management/utils.py | 74 +++------ cl/corpus_importer/tasks.py | 150 +++++++++++++----- cl/corpus_importer/utils.py | 7 +- pyproject.toml | 4 +- uv.lock | 2 +- 6 files changed, 141 insertions(+), 157 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index dc98738db1..d16131d503 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -1,61 +1,14 @@ from collections.abc import Iterable from pathlib import Path -import botocore.exceptions - from cl.celery_init import app from cl.corpus_importer.management.utils import ( CorpusImporterCommand, - TexasDocketMeta, ) -from cl.corpus_importer.tasks import merge_texas_docket, parse_texas_docket -from cl.lib.command_utils import logger -from cl.lib.decorators import time_call -from cl.lib.storage import AWSMediaStorage - - -@app.task( - bind=True, - autoretry_for=( - botocore.exceptions.HTTPClientError, - botocore.exceptions.ConnectionError, - ), - max_retries=5, - retry_backoff=10, - 
ignore_result=True, +from cl.corpus_importer.tasks import ( + texas_corpus_download_task, + texas_ingest_docket_task, ) -@time_call(logger) -def _texas_corpus_download_task( - self: app.Task, - docket: tuple[str, str], - docket_meta: tuple[str, str], -) -> tuple[bytes, TexasDocketMeta]: - """Downloads a scraped file from S3 and returns it for parsing. - - :param docket: Tuple of S3 bucket name and key where docket HTML is stored. - :param docket_meta: Tuple of S3 bucket name and key where docket metadata - is stored. - :return: Tuple with entries: Bytes of downloaded file, dictionary with - response headers, and docket metadata.""" - storage = AWSMediaStorage(bucket_name=docket[0]) - logger.info( - "Downloading docket HTML from S3: (Bucket: %s; Path: %s)", - docket[0], - docket[1], - ) - with storage.open(docket[1], "rb") as f: - content = f.read() - - storage = AWSMediaStorage(bucket_name=docket_meta[0]) - logger.info( - "Downloading docket meta from S3: (Bucket: %s; Path: %s)", - docket_meta[0], - docket_meta[1], - ) - with storage.open(docket_meta[1], "r") as f: - meta = TexasDocketMeta.model_validate_json(f.read()) - - return content, meta class Command(CorpusImporterCommand): @@ -89,12 +42,8 @@ def transform_inventory_iterator( @staticmethod def download_task() -> app.Task: - return _texas_corpus_download_task - - @staticmethod - def parse_task() -> app.Task: - return parse_texas_docket + return texas_corpus_download_task @staticmethod def merge_task() -> app.Task: - return merge_texas_docket + return texas_ingest_docket_task diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py index 5621e5064c..f050770cbe 100644 --- a/cl/corpus_importer/management/utils.py +++ b/cl/corpus_importer/management/utils.py @@ -7,7 +7,6 @@ from itertools import islice from typing import final -import botocore.exceptions from celery import chain from django.conf import settings from pydantic import BaseModel, field_validator @@ -19,7 +18,6 @@ 
get_last_parent_document_id_processed, log_last_document_indexed, ) -from cl.lib.storage import AWSMediaStorage class TexasDocketMeta(BaseModel): @@ -41,54 +39,26 @@ def date_filed_validator(cls, v): return date(*(time.strptime(v, "%m/%d/%Y")[0:3])) -@app.task( - bind=True, - autoretry_for=( - botocore.exceptions.HTTPClientError, - botocore.exceptions.ConnectionError, - ), - max_retries=5, - retry_backoff=10, - ignore_result=True, -) -def _corpus_download_task(bucket: str, s3_key: str) -> tuple[bytes, str, str]: - """Downloads a scraped file from S3 and returns it for parsing. - - :param bucket: S3 bucket name. - :param s3_key: S3 key to download file from. - :return: Tuple with entries: Bytes of downloaded file, the bucket - parameter, and the s3_key parameter.""" - logger.info("Downloading file from S3: %s", s3_key) - storage = AWSMediaStorage(bucket_name=bucket) - with storage.open(s3_key, "rb") as f: - content = f.read() - return content, bucket, s3_key - - class CorpusImporterCommand(VerboseCommand, ABC): - """Base class for `cl.corpus_importer` commands encapsulating inventory - file reading, celery queue interactions, and redis logging. + """Base class for `cl.corpus_importer` commands encapsulating inventory\ + file reading, celery queue interactions, and redis logging. - Uses an inventory CSV from S3 to find files to parse and ingest into the - database. Includes ratelimiting and autoresume logic. + Uses an inventory CSV from S3 to find files to parse and ingest into the\ + database. Includes ratelimiting and autoresume logic. Required methods are: - - `parse_task`: Should return a Celery task which parses a `bytes` object - into some usable format, typically using Juriscraper. Signature should be: - `task(content: bytes, bucket_name: str, s3_key: str)`, unless you manually - override `download_task` to return a different format. - `merge_task`: Should return a Celery task which takes the output of - `parse_task` and merges it into the database. 
Input should be whatever the - output of `parse_task` is. + `download_task`, parses it, and merges it into the database. Input\ + should be whatever the output of `download_task` is. Required properties are: - `compose_redis_key`: The Redis log key to use for tracking progress. Optional methods are: - - `download_task`: Should return the task used to download files from S3. A - default implementation is provided for convenience.""" + - `download_task`: Should return the task used to download files from S3.\ + A default implementation is provided for convenience.""" compose_redis_key: str @@ -104,11 +74,6 @@ def add_arguments(self, parser): default="celery", help="Which celery queue to use for S3 retrieval.", ) - parser.add_argument( - "--parsing-queue", - default="celery", - help="Which celery queue to use for document parsing.", - ) parser.add_argument( "--ingesting-queue", default="celery", @@ -154,11 +119,9 @@ def add_arguments(self, parser): @staticmethod def download_task() -> app.Task: - return _corpus_download_task + from cl.corpus_importer.tasks import default_corpus_download_task - @staticmethod - @abstractmethod - def parse_task() -> app.Task: ... + return default_corpus_download_task @staticmethod @abstractmethod @@ -168,14 +131,17 @@ def merge_task() -> app.Task: ... def transform_inventory_iterator( csv_reader: Iterable[list[str]], ) -> Iterable: - """Optionally performs transformations on the inventory CSV file before - passing it to the download Celery task. Can be used for instance to - merge consecutive rows which represent the same docket into one object. + """ + Optionally performs transformations on the inventory CSV file\ + before passing it to the download Celery task. Can be used for\ + instance to merge consecutive rows which represent the same docket\ + into one object. :param csv_reader: The `csv.Reader` object to use to read the CSV. - :return: The transformed inventory CSV iterator. 
The item of the - iterable should be a list of arguments to be passed to the download - task.""" + + :return: The transformed inventory CSV iterator. The item of the\ + iterable should be a list of arguments to be passed to the\ + download task.""" return map(lambda row: [row[0].strip(), row[1].strip()], csv_reader) @final @@ -183,7 +149,6 @@ def handle(self, *args, **options): super().handle(*args, **options) retrieval_queue = options["retrieval_queue"] - parse_queue = options["parsing_queue"] ingesting_queue = options["ingesting_queue"] delay = options["delay"] inventory_rows = options["inventory_rows"] @@ -220,7 +185,6 @@ def handle(self, *args, **options): self.download_task() .si(*download_args) .set(queue=retrieval_queue), - self.parse_task().s().set(queue=parse_queue), self.merge_task().s().set(queue=ingesting_queue), ).apply_async() time.sleep(delay) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 26eed58f78..51eec5f0f9 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -16,6 +16,7 @@ from tempfile import NamedTemporaryFile from typing import IO, Any, NamedTuple +import botocore.exceptions import environ import eyecite import internetarchive as ia @@ -123,6 +124,7 @@ ) from cl.custom_filters.templatetags.text_filters import best_case_name from cl.lib.celery_utils import throttle_task +from cl.lib.command_utils import logger from cl.lib.crypto import sha1 from cl.lib.decorators import retry, time_call from cl.lib.llm import call_llm @@ -150,6 +152,7 @@ get_document_filename, ) from cl.lib.redis_utils import delete_redis_semaphore, get_redis_interface +from cl.lib.storage import AWSMediaStorage from cl.lib.types import TaskData from cl.people_db.lookup_utils import ( lookup_judge_by_full_name_and_set_attr, @@ -3915,11 +3918,11 @@ def merge_texas_case_transfers( return MergeResult.failed() else: if appeals_court["court_id"] == CourtID.UNKNOWN: - logger.warning( + logger.error( "Found appellate court with 
unknown ID (docket %s)", docket.docket_number, ) - appeals_court_id = "texapp" + return MergeResult.failed() else: appeals_court_id = texas_js_court_id_to_court_id( appeals_court["court_id"] @@ -3932,7 +3935,7 @@ def merge_texas_case_transfers( "Found appellate court with unknown ID (docket %s)", docket.docket_number, ) - appeals_court_id = "texapp" + return MergeResult.failed() else: appeals_court_id = texas_js_court_id_to_court_id( appeals_court["court_id"] @@ -3986,7 +3989,11 @@ def merge_texas_case_transfers( transfer_date=transfer_from_date if transfer_from_date else docket_data["date_filed"], - transfer_type=CaseTransfer.WORKLOAD, + # Texas Government Code 73.001 (accessed 2026-02-23) + transfer_type=CaseTransfer.JURISDICTION + if docket_data["court_id"] + == CourtID.FIFTEENTH_COURT_OF_APPEALS + else CaseTransfer.WORKLOAD, ) ) else: @@ -4033,30 +4040,18 @@ def generate_texas_appellate_brief_flags( :param appellate_briefs: A list of TexasAppellateBrief objects. :return: A list of booleans indicating whether the corresponding entry is an appellate brief.""" - if not appellate_briefs: - return [False] * len(case_events) - i = 0 + brief_iter = iter(appellate_briefs) + next_brief = next(brief_iter, None) flags = [] - # Assumes that appellate briefs will appear in the same order as the - # corresponding case events. for case_event in case_events: - if i == len(appellate_briefs): - flags.append(False) - continue - if case_event == appellate_briefs[i]: + if next_brief is not None and case_event == next_brief: flags.append(True) - i += 1 + next_brief = next(brief_iter, None) else: flags.append(False) - return flags -@app.task( - max_retries=5, - ignore_result=True, -) -@time_call(logger) def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4065,6 +4060,7 @@ def merge_texas_docket( """Merges scraped data from a Texas docket into the `Docket` table. :param docket_data: The scraped Texas docket data. 
+ :return: The result of the merge operation.""" court = Court.objects.get( pk=texas_js_court_id_to_court_id(docket_data["court_id"]) @@ -4073,6 +4069,11 @@ def merge_texas_docket( logger.info("Merging Texas docket %s", docket_number) with transaction.atomic(): docket = None + if docket_data["court_type"] == CourtType.UNKNOWN.value: + logger.error( + "Texas docket %s has unknown court type", docket_number + ) + return MergeResult.failed() if docket_data["court_type"] == CourtType.APPELLATE.value: logger.info( "Docket is appellate. Checking if disaggregation is necessary..." @@ -4092,7 +4093,7 @@ def merge_texas_docket( "Disaggregating Texas appellate docket %s", docket_number ) docket.court = court - else: + if docket is None: docket = async_to_sync(find_docket_object)( court_id=court.pk, pacer_case_id=None, @@ -4218,22 +4219,21 @@ def merge_texas_docket( ignore_result=True, ) @time_call(logger) -def parse_texas_docket( - self: Task, i: tuple[bytes, TexasDocketMeta] -) -> ( - TexasCourtOfAppealsDocket - | TexasCourtOfCriminalAppealsDocket - | TexasSupremeCourtDocket - | None -): - """Uses Juriscraper to parse bytes into a Texas docket object. +def texas_ingest_docket_task( + task: Task, + i: tuple[bytes, TexasDocketMeta], +) -> MergeResult: + """ + Task to parse and merge a Texas docket. + + :param task: The Celery task. - :param self: The Celery task. :param i: Tuple with the following entries: - - Bytes string to parse. - - The response headers to the scraper. - - Docket metadata. - :return: The parsed docket or `None` if parsing failed.""" + - Bytes string to parse. + - Docket metadata. + + :return: The result of the merge operation. 
+ """ content, meta = i logger.info("Attempting to parse Texas docket %s...", meta.case_number) try: @@ -4249,16 +4249,86 @@ def parse_texas_docket( meta.court_code, meta.case_number, ) - self.request.chain = None - return None + task.request.chain = None + return MergeResult.failed() parser._parse_text(content.decode("utf-8")) - return parser.data + docket_data = parser.data except Exception as e: - self.request.chain = None logger.error( "Encountered error parsing Texas docket at URL %s: %s", meta.case_url, str(e), ) - return None + task.request.chain = None + return MergeResult.failed() + return merge_texas_docket(docket_data) + + +@app.task( + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +def default_corpus_download_task( + bucket: str, s3_key: str +) -> tuple[bytes, str, str]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param bucket: S3 bucket name. + :param s3_key: S3 key to download file from. + :return: Tuple with entries: Bytes of downloaded file, the bucket + parameter, and the s3_key parameter.""" + logger.info("Downloading file from S3: %s", s3_key) + storage = AWSMediaStorage(bucket_name=bucket) + with storage.open(s3_key, "rb") as f: + content = f.read() + return content, bucket, s3_key + + +@app.task( + autoretry_for=( + botocore.exceptions.HTTPClientError, + botocore.exceptions.ConnectionError, + ), + max_retries=5, + retry_backoff=10, + ignore_result=True, +) +@time_call(logger) +def texas_corpus_download_task( + docket: tuple[str, str], + docket_meta: tuple[str, str], +) -> tuple[bytes, TexasDocketMeta]: + """Downloads a scraped file from S3 and returns it for parsing. + + :param docket: Tuple of S3 bucket name and key where docket HTML is stored. + + :param docket_meta: Tuple of S3 bucket name and key where docket metadata\ + is stored. 
+ + :return: Tuple with entries: Bytes of downloaded file, dictionary with\ + response headers, and docket metadata.""" + storage = AWSMediaStorage(bucket_name=docket[0]) + logger.info( + "Downloading docket HTML from S3: (Bucket: %s; Path: %s)", + docket[0], + docket[1], + ) + with storage.open(docket[1], "rb") as f: + content = f.read() + + storage = AWSMediaStorage(bucket_name=docket_meta[0]) + logger.info( + "Downloading docket meta from S3: (Bucket: %s; Path: %s)", + docket_meta[0], + docket_meta[1], + ) + with storage.open(docket_meta[1], "r") as f: + meta = TexasDocketMeta.model_validate_json(f.read()) + + return content, meta diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 28c3b59ac0..612a56f8e9 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1317,10 +1317,11 @@ def texas_js_court_id_to_court_id(js_court_id: str) -> str: return "tex" if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: return "texcrimapp" + if js_court_id == CourtID.UNKNOWN.value: + logger.error("Unknown court ID: %s", js_court_id) + return "" # Court of appeals - appellate_number = str(int(js_court_id[len("texas_coa") :])) - if appellate_number == "13": - appellate_number = "13A" + appellate_number = str(int(js_court_id.removeprefix("texas_coa"))) return f"txctapp{appellate_number}" diff --git a/pyproject.toml b/pyproject.toml index 66836d83d7..23d3c9d39c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ urls.Home = "https://www.courtlistener.com/" urls.Repository = "https://github.com/freelawproject/courtlistener" urls.Documentation = "https://github.com/freelawproject/courtlistener/wiki" license = " AGPL-3.0-only" -license-files = ["LICENSE.txt"] +license-files = [ "LICENSE.txt" ] dependencies = [ "ada-url>=1.28.0", @@ -112,7 +112,7 @@ dependencies = [ "django-cotton>=2.6.0", "django-cursor-pagination>=0.3.0", "django-elasticsearch-dsl>=8.0", - "juriscraper>=2.7.7", + "juriscraper>=2.6.68", 
"instructor>=1.14.1", "django-s3-express-cache>=0.1.0", "zohocrmsdk8-0==4.0.0", diff --git a/uv.lock b/uv.lock index 681ba6f126..e9b818daa8 100644 --- a/uv.lock +++ b/uv.lock @@ -502,7 +502,7 @@ requires-dist = [ { name = "ipython", specifier = ">=9.9.0" }, { name = "itypes", specifier = ">=1.1.0" }, { name = "judge-pics", specifier = ">=2.0.5" }, - { name = "juriscraper", specifier = ">=2.7.7" }, + { name = "juriscraper", specifier = ">=2.6.68" }, { name = "kombu", specifier = ">=5.5.1" }, { name = "lxml", specifier = ">=6.0.2" }, { name = "markdown2", specifier = ">=2.5.4" }, From ee7e97a54c4feea4fb71836fa9fd1e7afed5044c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 23 Feb 2026 18:28:30 -0700 Subject: [PATCH 41/87] fix(texas): Missing docket numbers `find_docket_object` doesn't set docket numbers when creating dockets :'( --- cl/corpus_importer/tasks.py | 63 +++++++++++++++++++++++++++++++------ cl/corpus_importer/utils.py | 6 ++-- cl/recap/mergers.py | 3 ++ 3 files changed, 60 insertions(+), 12 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 51eec5f0f9..ea847f57c2 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -132,6 +132,7 @@ doc_page_count_service, microservice, ) +from cl.lib.model_helpers import make_docket_number_core from cl.lib.pacer import ( get_blocked_status, get_first_missing_de_date, @@ -3510,10 +3511,10 @@ def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: :return: The result of this merge attempt.""" logger.info( "Merging CaseTransfer from docket %s in court %s to docket %s in court %s on %s with type %s.", - case_transfer.origin_court.pk, case_transfer.origin_docket_number, - case_transfer.destination_court.pk, + case_transfer.origin_court.pk, case_transfer.destination_docket_number, + case_transfer.destination_court.pk, case_transfer.transfer_date.isoformat(), case_transfer.transfer_type, ) @@ 
-3878,6 +3879,11 @@ def merge_texas_case_transfers( :param docket: The docket to add the appeal information to. :param docket_data: The docket data from Juriscraper. :return: The result of the CaseTransfer merge operation""" + logger.info( + "Determining transfers for docket %s in court %s...", + docket.docket_number, + docket.court.pk, + ) trial_court_id = texas_originating_court_to_court_id( docket_data["originating_court"] ) @@ -3896,13 +3902,18 @@ def merge_texas_case_transfers( appeals_court = docket_data["appeals_court"] if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS.value: + logger.info("Docket %s is from the CCA", docket.docket_number) # Death penalty cases are automatically appealed to the CCA so the # appellate court may be missing. if ( not appeals_court - or appeals_court["court_id"] == CourtID.UNKNOWN + or appeals_court["court_id"] == CourtID.UNKNOWN.value ): # Death penalty appeal + logger.info( + "Docket %s in the CCA is a death penalty appeal", + docket.docket_number, + ) if trial_court_id: transfer.origin_court = Court.objects.get( pk=trial_court_id @@ -3917,7 +3928,11 @@ def merge_texas_case_transfers( ) return MergeResult.failed() else: - if appeals_court["court_id"] == CourtID.UNKNOWN: + logger.info( + "Docket %s in the CCA is not a death penalty appeal", + docket.docket_number, + ) + if appeals_court["court_id"] == CourtID.UNKNOWN.value: logger.error( "Found appellate court with unknown ID (docket %s)", docket.docket_number, @@ -3927,10 +3942,16 @@ def merge_texas_case_transfers( appeals_court_id = texas_js_court_id_to_court_id( appeals_court["court_id"] ) + logger.info( + "Appeals court ID for CCA docket %s is %s", + docket.docket_number, + appeals_court_id, + ) transfer.origin_court = Court.objects.get(pk=appeals_court_id) transfer.origin_docket_number = appeals_court["case_number"] elif docket_data["court_id"] == CourtID.SUPREME_COURT.value: - if appeals_court["court_id"] == CourtID.UNKNOWN: + logger.info("Docket %s is from 
the SC", docket.docket_number) + if appeals_court["court_id"] == CourtID.UNKNOWN.value: logger.warning( "Found appellate court with unknown ID (docket %s)", docket.docket_number, @@ -3940,6 +3961,11 @@ def merge_texas_case_transfers( appeals_court_id = texas_js_court_id_to_court_id( appeals_court["court_id"] ) + logger.info( + "Appeals court ID for SC docket %s is %s", + docket.docket_number, + appeals_court_id, + ) transfer.origin_court = Court.objects.get(pk=appeals_court_id) transfer.origin_docket_number = appeals_court["case_number"] else: @@ -3950,8 +3976,13 @@ def merge_texas_case_transfers( return MergeResult.failed() transfers = [transfer] elif docket_data["court_type"] == CourtType.APPELLATE.value: + logger.info("Docket %s is an appellate docket", docket.docket_number) transfers = [] if trial_court_id: + logger.info( + "Appellate docket %s has a valid trial court", + docket.docket_number, + ) transfers.append( CaseTransfer( origin_court=Court.objects.get(pk=trial_court_id), @@ -3966,6 +3997,9 @@ def merge_texas_case_transfers( ) ) if docket_data["transfer_from"]: + logger.info( + "Appellate docket %s has a transfer in", docket.docket_number + ) transfer_from_date = docket_data["transfer_from"]["date"] if not transfer_from_date: logger.warning( @@ -4104,6 +4138,8 @@ def merge_texas_docket( docket_source=Docket.SCRAPER, allow_create=True, ) + docket.docket_number = docket_number + docket.docket_number_core = make_docket_number_core(docket_number) docket.date_filed = docket_data["date_filed"] docket.cause = docket_data["case_type"] originating_court_merge_result = merge_texas_docket_originating_court( @@ -4122,10 +4158,19 @@ def merge_texas_docket( lower_court_data ) else: - lower_court_data = docket_data["appeals_court"] - lower_court_id = texas_js_court_id_to_court_id( - lower_court_data["court_id"] - ) + if ( + docket_data["appeals_court"]["court_id"] + == CourtID.UNKNOWN.value + ): + lower_court_data = docket_data["originating_court"] + lower_court_id = 
texas_originating_court_to_court_id( + lower_court_data + ) + else: + lower_court_data = docket_data["appeals_court"] + lower_court_id = texas_js_court_id_to_court_id( + lower_court_data["court_id"] + ) if lower_court_id is not None: court = Court.objects.get(pk=lower_court_id) diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 612a56f8e9..84f05fa000 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1308,18 +1308,18 @@ class DownloadPDFResult: sha1: str | None = None -def texas_js_court_id_to_court_id(js_court_id: str) -> str: +def texas_js_court_id_to_court_id(js_court_id: str) -> str | None: """Translates a Juriscraper Texas court ID to a CourtListener Court ID. :param js_court_id: The court ID extracted from Juriscraper. - :return: The corresponding Court ID.""" + :return: The corresponding Court ID or None if invalid.""" if js_court_id == CourtID.SUPREME_COURT.value: return "tex" if js_court_id == CourtID.COURT_OF_CRIMINAL_APPEALS.value: return "texcrimapp" if js_court_id == CourtID.UNKNOWN.value: logger.error("Unknown court ID: %s", js_court_id) - return "" + return None # Court of appeals appellate_number = str(int(js_court_id.removeprefix("texas_coa"))) return f"txctapp{appellate_number}" diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 65593c2a00..ddd8784d71 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -140,6 +140,9 @@ async def find_docket_object( """Attempt to find the docket based on the parsed docket data. If cannot be found, create a new docket. If multiple are found, return the oldest. + Note: Only sets `source`, `pacer_case_id`, and `court_id` fields on the + created docket. + :param court_id: The CourtListener court_id to lookup :param pacer_case_id: The PACER case ID for the docket :param docket_number: The docket number to lookup. 
From f1f23c2019ec83fc7f18f7ffa9ee2e34c6ec3fdd Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 24 Feb 2026 10:47:02 -0700 Subject: [PATCH 42/87] feat(command): Add command to populate existing CaseTransfers Try to update missing CaseTransfer docket foreign keys after the Texas import command completes and on a schedule. --- .../commands/fill_case_transfers.py | 20 +++++ .../commands/import_texas_dockets.py | 5 ++ cl/corpus_importer/management/utils.py | 1 - cl/corpus_importer/tasks.py | 15 ++++ cl/search/models.py | 84 ++++++++++++++++++- 5 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 cl/corpus_importer/management/commands/fill_case_transfers.py diff --git a/cl/corpus_importer/management/commands/fill_case_transfers.py b/cl/corpus_importer/management/commands/fill_case_transfers.py new file mode 100644 index 0000000000..cb168b106f --- /dev/null +++ b/cl/corpus_importer/management/commands/fill_case_transfers.py @@ -0,0 +1,20 @@ +from cl.corpus_importer.tasks import fill_case_transfer_missing_dockets +from cl.lib.command_utils import VerboseCommand + + +class Command(VerboseCommand): + help = "Update missing docket foreign keys in the CaseTransfer table." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--queue", + type=str, + help="The queue to run the update task in.", + default="celery", + ) + + def handle(self, *args, **options): + super().handle(*args, **options) + + queue = options["queue"] + fill_case_transfer_missing_dockets.delay(queue) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index d16131d503..e35845ac2c 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -6,6 +6,7 @@ CorpusImporterCommand, ) from cl.corpus_importer.tasks import ( + fill_case_transfer_missing_dockets, texas_corpus_download_task, texas_ingest_docket_task, ) @@ -47,3 +48,7 @@ def download_task() -> app.Task: @staticmethod def merge_task() -> app.Task: return texas_ingest_docket_task + + def handle(self, *args, **options): + super().handle(*args, **options) + fill_case_transfer_missing_dockets.delay() diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py index f050770cbe..8563657ef8 100644 --- a/cl/corpus_importer/management/utils.py +++ b/cl/corpus_importer/management/utils.py @@ -144,7 +144,6 @@ def transform_inventory_iterator( download task.""" return map(lambda row: [row[0].strip(), row[1].strip()], csv_reader) - @final def handle(self, *args, **options): super().handle(*args, **options) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index ea847f57c2..7d03c156a8 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4377,3 +4377,18 @@ def texas_corpus_download_task( meta = TexasDocketMeta.model_validate_json(f.read()) return content, meta + + +@app.task( + ignore_result=True, +) +@time_call(logger) +def fill_case_transfer_missing_dockets(): + """ + Attempt to populate missing Docket foreign key fields in CaseTransfer\ + objects. 
Run after a scraping task and on a daily schedule. + """ + logger.info( + "Attempting to populate missing CaseTransfer docket foreign keys..." + ) + CaseTransfer.fill_null_dockets() diff --git a/cl/search/models.py b/cl/search/models.py index 46179373d6..c51be0c510 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -6,7 +6,7 @@ import nh3 import pghistory import pytz -from asgiref.sync import sync_to_async +from asgiref.sync import async_to_sync, sync_to_async from celery.canvas import chain from django.contrib.contenttypes.fields import GenericRelation from django.contrib.postgres.indexes import HashIndex @@ -4054,6 +4054,88 @@ class CaseTransfer(AbstractDateTimeModel): choices=transfer_type_choices.items(), ) + # We currently only generate transfers for state courts, and we do not + # scrape trial courts so skip trying to populate fields we'll never be able + # to populate. + TRACKED_JURISDICTIONS = (Court.STATE_APPELLATE, Court.STATE_SUPREME) + + @classmethod + def fill_null_dockets(cls): + from cl.recap.mergers import find_docket_object + + logger.info( + "Attempting to populate missing fields in CaseTransfer table..." 
+ ) + + missing_origin_docket = cls.objects.filter( + origin_court__jurisdiction__in=cls.TRACKED_JURISDICTIONS, + origin_docket__isnull=True, + ) + total_origin = missing_origin_docket.count() + for transfer in missing_origin_docket: + origin_docket = async_to_sync(find_docket_object)( + court_id=transfer.origin_court_id, + pacer_case_id=None, + docket_number=transfer.origin_docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=False, + ) + if origin_docket: + logger.info( + "Found origin docket %s!", transfer.origin_docket_number + ) + transfer.origin_docket = origin_docket + else: + logger.info( + "Could not find origin docket %s.", + transfer.origin_docket_number, + ) + updated_origin = cls.objects.bulk_update( + missing_origin_docket, ["origin_docket"], batch_size=100 + ) + + missing_destination_docket = cls.objects.filter( + destination_court__jurisdiction__in=cls.TRACKED_JURISDICTIONS, + destination_docket__isnull=True, + ) + total_destination = missing_destination_docket.count() + for transfer in missing_destination_docket: + destination_docket = async_to_sync(find_docket_object)( + court_id=transfer.destination_court_id, + pacer_case_id=None, + docket_number=transfer.destination_docket_number, + federal_defendant_number=None, + federal_dn_judge_initials_assigned=None, + federal_dn_judge_initials_referred=None, + docket_source=Docket.SCRAPER, + allow_create=False, + ) + if destination_docket: + logger.info( + "Found destination docket %s!", + transfer.destination_docket_number, + ) + transfer.destination_docket = destination_docket + else: + logger.info( + "Could not find destination docket %s.", + transfer.destination_docket_number, + ) + updated_destination = cls.objects.bulk_update( + missing_destination_docket, ["destination_docket"], batch_size=100 + ) + + logger.info( + "Update complete. 
Populated %s/%s origin dockets and %s/%s destination dockets.", + updated_origin, + total_origin, + updated_destination, + total_destination, + ) + class Meta: constraints = [ CheckConstraint( From 54df36dc917bf592c6ad7daea7b770ae6c410779 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 24 Feb 2026 16:29:42 -0700 Subject: [PATCH 43/87] test(command): Add test for CaseTransfer population functionality --- cl/search/tests/tests.py | 83 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cl/search/tests/tests.py b/cl/search/tests/tests.py index 07e4e0ef3a..c1eef74d5c 100644 --- a/cl/search/tests/tests.py +++ b/cl/search/tests/tests.py @@ -60,6 +60,7 @@ ) from cl.search.exception import InvalidRelativeDateSyntax from cl.search.factories import ( + CaseTransferFactory, CourtFactory, DocketEntryFactory, DocketFactory, @@ -83,6 +84,7 @@ from cl.search.models import ( PRECEDENTIAL_STATUS, SEARCH_TYPES, + CaseTransfer, Citation, ClusterRedirection, Court, @@ -3858,3 +3860,84 @@ def test_llm_clean_docket_number_daemon( {str(self.docket_4.id)}, "Redis cache set should contain docket_4 only", ) + + +class CaseTransferFillNullDocketsTest(TestCase): + """Tests for CaseTransfer.fill_null_dockets.""" + + @classmethod + def setUpTestData(cls): + cls.appellate_court = CourtFactory.create( + jurisdiction=Court.STATE_APPELLATE, + ) + cls.supreme_court = CourtFactory.create( + jurisdiction=Court.STATE_SUPREME, + ) + + def test_fills_missing_origin_docket(self): + """Does fill_null_dockets populate a missing origin_docket FK?""" + origin_docket = DocketFactory.create(court=self.appellate_court) + destination_docket = DocketFactory.create(court=self.supreme_court) + transfer = CaseTransferFactory.create( + origin_court=origin_docket.court, + origin_docket=None, + origin_docket_number=origin_docket.docket_number, + destination_court=destination_docket.court, + 
destination_docket=destination_docket, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket_id == origin_docket.pk + + def test_fills_missing_destination_docket(self): + """Does fill_null_dockets populate a missing destination_docket FK?""" + origin_docket = DocketFactory.create(court=self.appellate_court) + destination_docket = DocketFactory.create(court=self.supreme_court) + transfer = CaseTransferFactory.create( + origin_court=origin_docket.court, + origin_docket=origin_docket, + destination_court=destination_docket.court, + destination_docket=None, + destination_docket_number=destination_docket.docket_number, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.destination_docket == destination_docket + + def test_leaves_already_populated_dockets_unchanged(self): + """Does fill_null_dockets leave already-populated FKs alone?""" + origin = DocketFactory.create() + destination = DocketFactory.create() + transfer = CaseTransferFactory.create( + origin_court=origin.court, + origin_docket=origin, + destination_court=destination.court, + destination_docket=destination, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket == origin + assert transfer.destination_docket == destination + + def test_no_match_leaves_null(self): + """Does fill_null_dockets leave FK null when no docket is found?""" + destination_docket = DocketFactory.create() + transfer = CaseTransferFactory.create( + origin_court=CourtFactory.create(), + origin_docket=None, + origin_docket_number=destination_docket.docket_number + + "dontmatch", + destination_court=destination_docket.court, + destination_docket=destination_docket, + ) + + CaseTransfer.fill_null_dockets() + + transfer.refresh_from_db() + assert transfer.origin_docket is None From 21f5f49d639421464f1b1ede3a9be00f70705cc5 Mon Sep 17 00:00:00 2001 From: Morgan Bennet 
<98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 25 Feb 2026 11:27:12 -0700 Subject: [PATCH 44/87] fix(texas): Address PR feedback --- cl/corpus_importer/tasks.py | 54 +++++++++++++++++++++++-------------- cl/corpus_importer/utils.py | 31 ++++++++++----------- cl/lib/model_helpers.py | 24 +++++++++++++++++ cl/lib/tests.py | 16 +++++++++++ 4 files changed, 90 insertions(+), 35 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index ea847f57c2..e2cac9a8e2 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -132,7 +132,9 @@ doc_page_count_service, microservice, ) -from cl.lib.model_helpers import make_docket_number_core +from cl.lib.model_helpers import ( + make_texas_docket_number_core, +) from cl.lib.pacer import ( get_blocked_status, get_first_missing_de_date, @@ -4139,7 +4141,9 @@ def merge_texas_docket( allow_create=True, ) docket.docket_number = docket_number - docket.docket_number_core = make_docket_number_core(docket_number) + docket.docket_number_core = make_texas_docket_number_core( + docket_number + ) docket.date_filed = docket_data["date_filed"] docket.cause = docket_data["case_type"] originating_court_merge_result = merge_texas_docket_originating_court( @@ -4172,11 +4176,19 @@ def merge_texas_docket( lower_court_data["court_id"] ) + court_name = None if lower_court_id is not None: - court = Court.objects.get(pk=lower_court_id) - docket.appeal_from = court - court_name = court.full_name - else: + try: + court = Court.objects.get(pk=lower_court_id) + except Court.DoesNotExist: + logger.error( + "Could not find lower court with ID %s to set appeal_from for Texas docket.", + lower_court_id, + ) + else: + docket.appeal_from = court + court_name = court.full_name + if not court_name: logger.warning( "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", lower_court_id, @@ -4282,20 +4294,22 @@ def texas_ingest_docket_task( content, meta = i 
logger.info("Attempting to parse Texas docket %s...", meta.case_number) try: - if meta.court_code == "cossup": - parser = TexasSupremeCourtScraper() - elif meta.court_code == "coscca": - parser = TexasCourtOfCriminalAppealsScraper() - elif meta.court_code.startswith("coa"): - parser = TexasCourtOfAppealsScraper(meta.court_code) - else: - logger.error( - "Unrecognized Texas court type %s. Cannot parse docket %s.", - meta.court_code, - meta.case_number, - ) - task.request.chain = None - return MergeResult.failed() + match meta.court_code: + case "cossup": + parser = TexasSupremeCourtScraper() + case "coscca": + parser = TexasCourtOfCriminalAppealsScraper() + case _: + if meta.court_code.startswith("coa"): + parser = TexasCourtOfAppealsScraper(meta.court_code) + else: + logger.error( + "Unrecognized Texas court type %s. Cannot parse docket %s.", + meta.court_code, + meta.case_number, + ) + task.request.chain = None + return MergeResult.failed() parser._parse_text(content.decode("utf-8")) docket_data = parser.data diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 84f05fa000..155766e515 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -1334,20 +1334,21 @@ def texas_originating_court_to_court_id( :param court_data: The originating court data from Juriscraper. 
:return: The matching Court ID or None if no court could be found.""" court_type = court_data["court_type"] - if court_type == CourtType.APPELLATE.value: - return texas_js_court_id_to_court_id(court_data["court_id"]) - if court_type == CourtType.DISTRICT.value: - district_number = court_data["district"] - if district_number: - if district_number > 1: - district_number = district_number + 1 - return f"texdistct{district_number}" - return "texdistct" - if court_type == CourtType.BUSINESS.value: - return "texbizct" - if court_type == CourtType.MUNICIPAL.value: - return "texctyct" - if court_type == CourtType.PROBATE.value: - return "texprobct" + match court_type: + case CourtType.APPELLATE.value: + return texas_js_court_id_to_court_id(court_data["court_id"]) + case CourtType.DISTRICT.value: + district_number = court_data["district"] + if district_number: + if district_number > 1: + district_number = district_number + 1 + return f"texdistct{district_number}" + return "texdistct" + case CourtType.BUSINESS.value: + return "texbizct" + case CourtType.MUNICIPAL.value: + return "texctyct" + case CourtType.PROBATE.value: + return "texprobct" # County, justice, and unknown court types return None diff --git a/cl/lib/model_helpers.py b/cl/lib/model_helpers.py index 0cbac63f80..24b266d66e 100644 --- a/cl/lib/model_helpers.py +++ b/cl/lib/model_helpers.py @@ -111,6 +111,30 @@ def make_docket_number_core(docket_number: str | None) -> str: return "" +def make_texas_docket_number_core(docket_number: str | None) -> str: + """ + Normalize Texas docket numbers. + + There is overlap between valid Texas docket numbers and valid Federal\ + docket numbers, but they need to be normalized differently so we need a\ + separate method. + + :param docket_number: The docket number to normalize. + + :return: The normalized docket number. 
+ """ + + if docket_number is None: + return "" + not_alphanum_regex = re.compile(r"[^a-z0-9]") + + # Normalize dashes + docket_number = normalize_dashes(docket_number) + # Normalize to lowercase + docket_number = docket_number.lower() + return not_alphanum_regex.sub("", docket_number) + + def make_scotus_docket_number_core(docket_number: str | None) -> str: """Normalize SCOTUS docket numbers like 16A985. diff --git a/cl/lib/tests.py b/cl/lib/tests.py index 833cbaac09..9b5c31f8e3 100644 --- a/cl/lib/tests.py +++ b/cl/lib/tests.py @@ -28,6 +28,7 @@ linkify_orig_docket_number, make_docket_number_core, make_scotus_docket_number_core, + make_texas_docket_number_core, make_upload_path, ) from cl.lib.pacer import ( @@ -427,6 +428,21 @@ def test_making_docket_number_core(self) -> None: # an empty string. self.assertEqual(make_docket_number_core(None), "") + def test_texas_docket_number_core(self) -> None: + """Can we correctly normalize Texas docket numbers?""" + self.assertEqual( + make_texas_docket_number_core("04-97-00972-CV"), "049700972cv" + ) + self.assertEqual( + make_texas_docket_number_core("01-18-00277-CR"), "011800277cr" + ) + self.assertEqual(make_texas_docket_number_core("AP-77,129"), "ap77129") + self.assertEqual( + make_texas_docket_number_core("WR-70,849-04"), "wr7084904" + ) + self.assertEqual(make_texas_docket_number_core("A-4369-A"), "a4369a") + self.assertEqual(make_texas_docket_number_core("C-2302"), "c2302") + def test_avoid_generating_docket_number_core(self) -> None: """Can we avoid generating docket_number_core when the docket number format doesn't match a valid format or if a string contains more than From fe29010557f597bd5a525420aba1f1bb057f611a Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 25 Feb 2026 11:56:42 -0700 Subject: [PATCH 45/87] fix(texas): Incorrect docket_number_core in find_docket_object for Texas --- cl/corpus_importer/tests.py | 14 ++++++++++++-- cl/recap/mergers.py | 
12 +++++++----- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 9b7e043bc9..d939f4fad0 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -111,6 +111,7 @@ winnow_case_name, ) from cl.favorites.models import PrayerAvailability +from cl.lib.model_helpers import make_texas_docket_number_core from cl.lib.pacer import process_docket_data from cl.lib.redis_utils import get_redis_interface from cl.people_db.factories import ( @@ -2163,7 +2164,11 @@ def setUpTestData(cls): cls.texas_coa1 = CourtFactory.create(id="txctapp1") cls.docket_number_coa1 = "01-25-00011-CV" cls.docket_coa1 = DocketFactory.create( - court=cls.texas_coa1, docket_number=cls.docket_number_coa1 + court=cls.texas_coa1, + docket_number=cls.docket_number_coa1, + docket_number_core=make_texas_docket_number_core( + cls.docket_number_coa1 + ), ) cls.docket_coa1_entry = TexasDocketEntryFactory.create( docket=cls.docket_coa1, @@ -3102,7 +3107,12 @@ def test_merge_texas_docket_appellate_sets_appeal_from(self): def test_merge_texas_docket_final_court_sets_appeal_from(self): """Does merge_texas_docket set appeal_from for final courts?""" - docket_sc = DocketFactory.create(court=self.texas_sc) + sc_dn = "25-1066" + docket_sc = DocketFactory.create( + court=self.texas_sc, + docket_number=sc_dn, + docket_number_core=make_texas_docket_number_core(sc_dn), + ) appeals_court = TexasAppellateCourtInfoDictFactory( court_id=CourtID.FIRST_COURT_OF_APPEALS.value, ) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index bcae206471..433a25c26a 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -31,6 +31,7 @@ clean_docket_number, make_docket_number_core, make_scotus_docket_number_core, + make_texas_docket_number_core, ) from cl.lib.pacer import ( get_blocked_status, @@ -161,11 +162,12 @@ async def find_docket_object( # Attempt several lookups of decreasing specificity. 
Note that # pacer_case_id is required for Docket and Docket History uploads. d = None - docket_number_core = ( - make_scotus_docket_number_core(docket_number) - if court_id == "scotus" - else make_docket_number_core(docket_number) - ) + if court_id == "scotus": + docket_number_core = make_scotus_docket_number_core(docket_number) + elif court_id.startswith("tex") or court_id.startswith("tx"): + docket_number_core = make_texas_docket_number_core(docket_number) + else: + docket_number_core = make_docket_number_core(docket_number) lookups = [] if pacer_case_id: # Appellate RSS feeds don't contain a pacer_case_id, avoid lookups by From d984c8365bd2af9b33499ffde4f1da140426d793 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 25 Feb 2026 12:06:44 -0700 Subject: [PATCH 46/87] fix(texas): Overly broad condition for making Texas docket numbers --- cl/recap/mergers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 433a25c26a..c3d15d320c 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -164,7 +164,14 @@ async def find_docket_object( d = None if court_id == "scotus": docket_number_core = make_scotus_docket_number_core(docket_number) - elif court_id.startswith("tex") or court_id.startswith("tx"): + elif ( + court_id == "tex" + or court_id == "texcrimapp" + or court_id.startswith("txctapp") + or court_id.startswith("texdistct") + or court_id.startswith("texcrimdistct") + or court_id.startswith("texctyct") + ): docket_number_core = make_texas_docket_number_core(docket_number) else: docket_number_core = make_docket_number_core(docket_number) From 37aa895a2c4310ebc3a5a53db6b10969155149bf Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 26 Feb 2026 09:55:25 -0700 Subject: [PATCH 47/87] feat(texas): Add scraper source when merging dockets --- cl/corpus_importer/tasks.py | 1 + 
cl/search/models.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index e2cac9a8e2..a3e441716b 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4140,6 +4140,7 @@ def merge_texas_docket( docket_source=Docket.SCRAPER, allow_create=True, ) + docket.add_scraper_source() docket.docket_number = docket_number docket.docket_number_core = make_texas_docket_number_core( docket_number diff --git a/cl/search/models.py b/cl/search/models.py index be37ed29f8..16e76492b7 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -906,6 +906,10 @@ def save(self, update_fields=None, *args, **kwargs): def get_absolute_url(self) -> str: return reverse("view_docket", args=[self.pk, self.slug]) + def add_scraper_source(self) -> None: + if self.source in self.NON_SCRAPER_SOURCES(): + self.source = self.source + self.SCRAPER + def add_recap_source(self): if self.source == self.DEFAULT: self.source = self.RECAP_AND_SCRAPER From 94a5d730759473690143b64b7cf2650eca36f0e1 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 27 Feb 2026 16:47:53 -0700 Subject: [PATCH 48/87] fix(texas): Address PR feedback Extract repeated logic; fix potential OOM; call fill_null_dockets directly instead of using a task. 
--- .../commands/fill_case_transfers.py | 14 +-- .../commands/import_texas_dockets.py | 4 +- cl/corpus_importer/tasks.py | 15 --- cl/search/admin.py | 32 ++++++ cl/search/models.py | 99 +++++++++---------- 5 files changed, 83 insertions(+), 81 deletions(-) diff --git a/cl/corpus_importer/management/commands/fill_case_transfers.py b/cl/corpus_importer/management/commands/fill_case_transfers.py index cb168b106f..52f8744a1e 100644 --- a/cl/corpus_importer/management/commands/fill_case_transfers.py +++ b/cl/corpus_importer/management/commands/fill_case_transfers.py @@ -1,20 +1,10 @@ -from cl.corpus_importer.tasks import fill_case_transfer_missing_dockets from cl.lib.command_utils import VerboseCommand +from cl.search.models import CaseTransfer class Command(VerboseCommand): help = "Update missing docket foreign keys in the CaseTransfer table." - def add_arguments(self, parser): - parser.add_argument( - "--queue", - type=str, - help="The queue to run the update task in.", - default="celery", - ) - def handle(self, *args, **options): super().handle(*args, **options) - - queue = options["queue"] - fill_case_transfer_missing_dockets.delay(queue) + CaseTransfer.fill_null_dockets() diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index e35845ac2c..8e80579933 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -6,10 +6,10 @@ CorpusImporterCommand, ) from cl.corpus_importer.tasks import ( - fill_case_transfer_missing_dockets, texas_corpus_download_task, texas_ingest_docket_task, ) +from cl.search.models import CaseTransfer class Command(CorpusImporterCommand): @@ -51,4 +51,4 @@ def merge_task() -> app.Task: def handle(self, *args, **options): super().handle(*args, **options) - fill_case_transfer_missing_dockets.delay() + CaseTransfer.fill_null_dockets() diff --git 
a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 7d03c156a8..ea847f57c2 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4377,18 +4377,3 @@ def texas_corpus_download_task( meta = TexasDocketMeta.model_validate_json(f.read()) return content, meta - - -@app.task( - ignore_result=True, -) -@time_call(logger) -def fill_case_transfer_missing_dockets(): - """ - Attempt to populate missing Docket foreign key fields in CaseTransfer\ - objects. Run after a scraping task and on a daily schedule. - """ - logger.info( - "Attempting to populate missing CaseTransfer docket foreign keys..." - ) - CaseTransfer.fill_null_dockets() diff --git a/cl/search/admin.py b/cl/search/admin.py index e6b4403049..956d4775c4 100644 --- a/cl/search/admin.py +++ b/cl/search/admin.py @@ -14,6 +14,7 @@ from cl.lib.string_utils import trunc from cl.search.models import ( BankruptcyInformation, + CaseTransfer, Citation, Claim, ClaimHistory, @@ -358,6 +359,37 @@ class BankruptcyInformationAdmin(admin.ModelAdmin): raw_id_fields = ("docket",) +@admin.register(CaseTransfer) +class CaseTransferAdmin(CursorPaginatorAdmin): + raw_id_fields = ( + "origin_court", + "origin_docket", + "destination_court", + "destination_docket", + ) + list_display = ( + "pk", + "origin_court", + "origin_docket_number", + "destination_court", + "destination_docket_number", + "transfer_date", + "transfer_type", + ) + list_filter = ( + "transfer_type", + "transfer_date", + ) + search_fields = ( + "origin_docket_number", + "destination_docket_number", + ) + readonly_fields = ( + "date_created", + "date_modified", + ) + + @admin.register(RECAPDocument) class RECAPDocumentAdmin(CursorPaginatorAdmin): search_fields = ( diff --git a/cl/search/models.py b/cl/search/models.py index c51be0c510..447c2f112a 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -1,7 +1,7 @@ import logging import re from datetime import datetime -from typing import TypeVar +from typing import Literal, 
TypeVar import nh3 import pghistory @@ -4060,72 +4060,67 @@ class CaseTransfer(AbstractDateTimeModel): TRACKED_JURISDICTIONS = (Court.STATE_APPELLATE, Court.STATE_SUPREME) @classmethod - def fill_null_dockets(cls): + def _fill_null_docket_side( + cls, side: Literal["origin"] | Literal["destination"] + ) -> tuple[int, int]: + """Fill null docket FKs for one side (origin or destination). + + :param side: Either "origin" or "destination". + :return: Tuple of (updated_count, total_count). + """ from cl.recap.mergers import find_docket_object - logger.info( - "Attempting to populate missing fields in CaseTransfer table..." + qs = cls.objects.filter( + **{ + f"{side}_court__jurisdiction__in": cls.TRACKED_JURISDICTIONS, + f"{side}_docket__isnull": True, + } ) + total = qs.count() + updated_transfers: list[CaseTransfer] = [] + total_updated = 0 - missing_origin_docket = cls.objects.filter( - origin_court__jurisdiction__in=cls.TRACKED_JURISDICTIONS, - origin_docket__isnull=True, - ) - total_origin = missing_origin_docket.count() - for transfer in missing_origin_docket: - origin_docket = async_to_sync(find_docket_object)( - court_id=transfer.origin_court_id, + for transfer in qs.iterator(): + docket = async_to_sync(find_docket_object)( + court_id=getattr(transfer, f"{side}_court_id"), pacer_case_id=None, - docket_number=transfer.origin_docket_number, + docket_number=getattr(transfer, f"{side}_docket_number"), federal_defendant_number=None, federal_dn_judge_initials_assigned=None, federal_dn_judge_initials_referred=None, - docket_source=Docket.SCRAPER, allow_create=False, ) - if origin_docket: + if docket: logger.info( - "Found origin docket %s!", transfer.origin_docket_number + "Found %s docket %s!", + side, + getattr(transfer, f"{side}_docket_number"), ) - transfer.origin_docket = origin_docket - else: - logger.info( - "Could not find origin docket %s.", - transfer.origin_docket_number, + setattr(transfer, f"{side}_docket", docket) + updated_transfers.append(transfer) + + 
if len(updated_transfers) >= 100: + total_updated += cls.objects.bulk_update( + updated_transfers, [f"{side}_docket"] ) - updated_origin = cls.objects.bulk_update( - missing_origin_docket, ["origin_docket"], batch_size=100 - ) + updated_transfers = [] - missing_destination_docket = cls.objects.filter( - destination_court__jurisdiction__in=cls.TRACKED_JURISDICTIONS, - destination_docket__isnull=True, - ) - total_destination = missing_destination_docket.count() - for transfer in missing_destination_docket: - destination_docket = async_to_sync(find_docket_object)( - court_id=transfer.destination_court_id, - pacer_case_id=None, - docket_number=transfer.destination_docket_number, - federal_defendant_number=None, - federal_dn_judge_initials_assigned=None, - federal_dn_judge_initials_referred=None, - docket_source=Docket.SCRAPER, - allow_create=False, + if updated_transfers: + total_updated += cls.objects.bulk_update( + updated_transfers, [f"{side}_docket"] ) - if destination_docket: - logger.info( - "Found destination docket %s!", - transfer.destination_docket_number, - ) - transfer.destination_docket = destination_docket - else: - logger.info( - "Could not find destination docket %s.", - transfer.destination_docket_number, - ) - updated_destination = cls.objects.bulk_update( - missing_destination_docket, ["destination_docket"], batch_size=100 + + return total_updated, total + + @classmethod + def fill_null_dockets(cls) -> None: + logger.info( + "Attempting to populate missing fields in CaseTransfer table..." 
+ ) + + updated_origin, total_origin = cls._fill_null_docket_side("origin") + updated_destination, total_destination = cls._fill_null_docket_side( + "destination" ) logger.info( From 6fc4de7fd1d7d7c524a4ee6b8947060449718182 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 3 Mar 2026 14:32:14 -0700 Subject: [PATCH 49/87] fix(texas): Incorrect file filtering in import command and PR feedback --- .../commands/import_texas_dockets.py | 25 +++--- cl/corpus_importer/tasks.py | 81 ++++++++----------- 2 files changed, 46 insertions(+), 60 deletions(-) diff --git a/cl/corpus_importer/management/commands/import_texas_dockets.py b/cl/corpus_importer/management/commands/import_texas_dockets.py index 8e80579933..4189444da6 100644 --- a/cl/corpus_importer/management/commands/import_texas_dockets.py +++ b/cl/corpus_importer/management/commands/import_texas_dockets.py @@ -21,24 +21,21 @@ class Command(CorpusImporterCommand): def transform_inventory_iterator( csv_reader: Iterable[list[str]], ) -> Iterable[tuple[tuple[str, str], tuple[str, str]]]: - html_rows = filter( - lambda r: Path(r[1]).suffix == ".html" and "searches" not in r[1], + meta_rows = filter( + # Filter only for meta files which are not duplicates (don't end in "_X") and not for search result scrapes + lambda r: "searches" not in r[1] + and Path(r[1]).name.endswith("_meta.json"), map(lambda r: (r[0].strip(), r[1].strip()), csv_reader), ) - previous_key_stem = None - for html_row in html_rows: - html_bucket, html_key = html_row - html_path = Path(html_key) - docket_name = html_path.stem - if previous_key_stem and docket_name.startswith(previous_key_stem): - continue - else: - previous_key_stem = docket_name - meta_key = str(html_path.with_name(f"{docket_name}_meta.json")) + for meta_row in meta_rows: + meta_bucket, meta_key = meta_row + meta_path = Path(meta_key) + docket_name = meta_path.stem.removesuffix("_meta") + html_key = 
str(meta_path.with_name(f"{docket_name}.html")) yield ( - (html_bucket, html_key), - (html_bucket, meta_key), + (meta_bucket, html_key), + (meta_bucket, meta_key), ) @staticmethod diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index a3e441716b..4375d539c9 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3831,10 +3831,10 @@ def merge_texas_docket_originating_court( originating_court_information = docket.originating_court_information originating_court_data = docket_data["originating_court"] + oc_docket_number = originating_court_data["case"] - originating_court_information.docket_number = originating_court_data[ - "case" - ] + originating_court_information.docket_number = oc_docket_number + originating_court_information.docket_number_raw = oc_docket_number originating_court_information.court_reporter = originating_court_data[ "reporter" ] @@ -4098,7 +4098,7 @@ def merge_texas_docket( :param docket_data: The scraped Texas docket data. :return: The result of the merge operation.""" - court = Court.objects.get( + lower_court = Court.objects.get( pk=texas_js_court_id_to_court_id(docket_data["court_id"]) ) docket_number = docket_data["docket_number"] @@ -4128,10 +4128,10 @@ def merge_texas_docket( logger.info( "Disaggregating Texas appellate docket %s", docket_number ) - docket.court = court + docket.court = lower_court if docket is None: docket = async_to_sync(find_docket_object)( - court_id=court.pk, + court_id=lower_court.pk, pacer_case_id=None, docket_number=docket_number, federal_defendant_number=None, @@ -4154,56 +4154,46 @@ def merge_texas_docket( logger.error( "Failed to update originating court information for Texas docket %s in court %s", docket.docket_number, - court.pk, + lower_court.pk, ) - if docket_data["court_type"] == CourtType.APPELLATE.value: + if ( + docket_data["court_type"] == CourtType.APPELLATE.value + or docket_data["appeals_court"]["court_id"] + == CourtID.UNKNOWN.value + ): lower_court_data = 
docket_data["originating_court"] lower_court_id = texas_originating_court_to_court_id( lower_court_data ) else: - if ( - docket_data["appeals_court"]["court_id"] - == CourtID.UNKNOWN.value - ): - lower_court_data = docket_data["originating_court"] - lower_court_id = texas_originating_court_to_court_id( - lower_court_data - ) - else: - lower_court_data = docket_data["appeals_court"] - lower_court_id = texas_js_court_id_to_court_id( - lower_court_data["court_id"] - ) + lower_court_data = docket_data["appeals_court"] + lower_court_id = texas_js_court_id_to_court_id( + lower_court_data["court_id"] + ) - court_name = None + lower_court_name = None if lower_court_id is not None: try: - court = Court.objects.get(pk=lower_court_id) + lower_court = Court.objects.get(pk=lower_court_id) except Court.DoesNotExist: logger.error( "Could not find lower court with ID %s to set appeal_from for Texas docket.", lower_court_id, ) else: - docket.appeal_from = court - court_name = court.full_name - if not court_name: + docket.appeal_from = lower_court + lower_court_name = lower_court.full_name + if not lower_court_name: logger.warning( "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", lower_court_id, docket.pk, - court.pk, + lower_court.pk, ) # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts - court_name = lower_court_data["name"] - logger.info( - "Updating lower court info with court %s (ID %s).", - court_name, - lower_court_id, - ) - docket.appeal_from_str = court_name + lower_court_name = lower_court_data.get("name", "") + docket.appeal_from_str = lower_court_name docket.save() @@ -4212,7 +4202,7 @@ def merge_texas_docket( logger.error( "Failed to merge party data for Texas docket %s in court %s", docket.docket_number, - court.pk, + lower_court.pk, ) entry_merge_results = [ @@ -4235,7 +4225,7 @@ def merge_texas_docket( logger.error( "Failed to merge CaseTransfer data for Texas docket %s 
in court %s", docket.docket_number, - court.pk, + lower_court.pk, ) create = ( @@ -4300,17 +4290,16 @@ def texas_ingest_docket_task( parser = TexasSupremeCourtScraper() case "coscca": parser = TexasCourtOfCriminalAppealsScraper() + case court_code if court_code.startswith("coa"): + parser = TexasCourtOfAppealsScraper(meta.court_code) case _: - if meta.court_code.startswith("coa"): - parser = TexasCourtOfAppealsScraper(meta.court_code) - else: - logger.error( - "Unrecognized Texas court type %s. Cannot parse docket %s.", - meta.court_code, - meta.case_number, - ) - task.request.chain = None - return MergeResult.failed() + logger.error( + "Unrecognized Texas court type %s. Cannot parse docket %s.", + meta.court_code, + meta.case_number, + ) + task.request.chain = None + return MergeResult.failed() parser._parse_text(content.decode("utf-8")) docket_data = parser.data From 210087fee48575a9098bc2892433935b44a3aa5c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 4 Mar 2026 11:42:17 -0700 Subject: [PATCH 50/87] test(texas): Update factories to better model actual data --- cl/corpus_importer/tests.py | 48 ++++++++++++------------ cl/search/state/texas/factories.py | 60 +++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 30 deletions(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index d939f4fad0..fe7c214ae4 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -174,8 +174,8 @@ TexasAppellateCourtInfoDictFactory, TexasAppellateTransferDictFactory, TexasCaseDocumentDictFactory, + TexasCaseEventDictFactory, TexasCourtOfAppealsDocketDictFactory, - TexasDocketEntryDictFactory, TexasDocketEntryFactory, TexasDocumentFactory, TexasFinalCourtDocketDictFactory, @@ -2326,7 +2326,7 @@ def get_test_pdf( def test_merge_texas_docket_entry_new_entry(self): """Can we correctly handle a docket entry?""" - docket_entry = TexasDocketEntryDictFactory( + docket_entry = 
TexasCaseEventDictFactory( attachments=[TexasCaseDocumentDictFactory()], date=date.fromisoformat("2025-01-02"), type="Brief", @@ -2343,7 +2343,7 @@ def test_merge_texas_docket_entry_new_entry(self): created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == docket_entry["type"] - assert created_docket_entry.description == docket_entry["description"] + assert created_docket_entry.disposition == docket_entry["disposition"] assert created_docket_entry.date_filed == docket_entry["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id @@ -2353,7 +2353,7 @@ def test_merge_texas_docket_entry_new_entry(self): def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" - js_docket_entry = TexasDocketEntryDictFactory() + js_docket_entry = TexasCaseEventDictFactory() result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry @@ -2380,7 +2380,7 @@ def test_merge_texas_docket_entry_no_update(self): assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] assert ( - created_docket_entry.description == js_docket_entry["description"] + created_docket_entry.disposition == js_docket_entry["disposition"] ) assert created_docket_entry.date_filed == js_docket_entry["date"] n_attachments = TexasDocument.objects.filter( @@ -2391,7 +2391,7 @@ def test_merge_texas_docket_entry_no_update(self): def test_merge_texas_docket_entry_add_document(self): """Can we correctly add a new document to an existing docket entry?""" - js_docket_entry = TexasDocketEntryDictFactory() + js_docket_entry = TexasCaseEventDictFactory() initial_n_attachments = len(js_docket_entry["attachments"]) result = merge_texas_docket_entry( @@ -2419,7 +2419,7 @@ def test_merge_texas_docket_entry_add_document(self): assert 
created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] assert ( - created_docket_entry.description == js_docket_entry["description"] + created_docket_entry.disposition == js_docket_entry["disposition"] ) assert created_docket_entry.date_filed == js_docket_entry["date"] n_attachments = TexasDocument.objects.filter( @@ -2437,7 +2437,7 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-02.000", - description="First entry", + disposition="First entry", ) existing_entry_2 = TexasDocketEntryFactory.create( docket=self.docket_coa1, @@ -2445,12 +2445,12 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-02.001", - description="Second entry", + disposition="Second entry", ) - js_docket_entry = TexasDocketEntryDictFactory( + js_docket_entry = TexasCaseEventDictFactory( attachments=[], - description="Updated description", + disposition="Updated disposition", date=date.fromisoformat("2025-01-02"), type="Brief", ) @@ -2465,11 +2465,11 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): assert output.success is True assert output.pk == existing_entry_2.pk updated_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert updated_entry.description == "Updated description" + assert updated_entry.disposition == js_docket_entry["disposition"] assert updated_entry.sequence_number == "2025-01-02.001" # Ensure the first entry was not modified existing_entry_1.refresh_from_db() - assert existing_entry_1.description == "First entry" + assert existing_entry_1.disposition == "First entry" def test_merge_texas_docket_entry_single_match_updates_entry(self): """When exactly one entry matches by date/type/brief, update it even @@ -2480,12 +2480,12 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): 
entry_type="Brief", appellate_brief=True, sequence_number="2025-01-04.000", - description="Original description", + disposition="Original description", ) - js_docket_entry = TexasDocketEntryDictFactory( + js_docket_entry = TexasCaseEventDictFactory( attachments=[], - description="Updated description", + disposition="Updated disposition", date=date.fromisoformat("2025-01-04"), type="Brief", ) @@ -2500,7 +2500,7 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): assert output.success is True assert output.pk == existing_entry.pk updated_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert updated_entry.description == "Updated description" + assert updated_entry.disposition == js_docket_entry["disposition"] assert updated_entry.sequence_number == "2025-01-04.001" def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): @@ -2512,7 +2512,7 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-03.000", - description="First entry", + disposition="First entry", ) existing_entry_2 = TexasDocketEntryFactory.create( docket=self.docket_coa1, @@ -2520,12 +2520,12 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): entry_type="Brief", appellate_brief=True, sequence_number="2025-01-03.001", - description="Second entry", + disposition="Second entry", ) - js_docket_entry = TexasDocketEntryDictFactory( + js_docket_entry = TexasCaseEventDictFactory( attachments=[], - description="New third entry", + disposition="New third entry", date=date.fromisoformat("2025-01-03"), type="Brief", ) @@ -2541,13 +2541,13 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): assert output.pk is not None assert output.pk not in (existing_entry_1.pk, existing_entry_2.pk) new_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert new_entry.description == "New third entry" + assert new_entry.disposition == 
js_docket_entry["disposition"] assert new_entry.sequence_number == "2025-01-03.002" # Ensure existing entries were not modified existing_entry_1.refresh_from_db() existing_entry_2.refresh_from_db() - assert existing_entry_1.description == "First entry" - assert existing_entry_2.description == "Second entry" + assert existing_entry_1.disposition == "First entry" + assert existing_entry_2.disposition == "Second entry" def test_merge_single_party_with_attorney(self): """Can we merge a single party with an attorney?""" diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 3f2c923800..dd00530ee1 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -2,6 +2,7 @@ import random +import factory from factory import DictFactory, Faker, List, SubFactory from factory.declarations import LazyAttribute from factory.django import DjangoModelFactory @@ -23,10 +24,25 @@ class TexasCaseDocumentDictFactory(DictFactory): class TexasDocketEntryDictFactory(DictFactory): date = Faker("date_object") type = Faker("pystr", min_chars=3, max_chars=3) - disposition = Faker("text") + attachments = List([SubFactory(TexasCaseDocumentDictFactory)]) + + +class TexasAppellateBriefDictFactory(TexasDocketEntryDictFactory): description = Faker("text") + + +class TexasSupremeCourtAppellateBriefDictFactory( + TexasAppellateBriefDictFactory +): + remarks = Faker("text") + + +class TexasCaseEventDictFactory(TexasDocketEntryDictFactory): + disposition = Faker("text") + + +class TexasSupremeCourtCaseEventDictFactory(TexasCaseEventDictFactory): remarks = Faker("text") - attachments = List([SubFactory(TexasCaseDocumentDictFactory)]) class TexasCasePartyDictFactory(DictFactory): @@ -100,12 +116,19 @@ class TexasCommonDataDictFactory(DictFactory): case_type = Faker("pystr") parties = List([SubFactory(TexasCasePartyDictFactory)]) originating_court = SubFactory(TexasOriginatingCourtDictFactory) - case_events = 
List([SubFactory(TexasDocketEntryDictFactory)]) + case_events = List([SubFactory(TexasCaseEventDictFactory)]) appellate_briefs = LazyAttribute( lambda d: list( - filter( - lambda e: True if random.random() < 0.1 else False, - d.case_events, + map( + lambda ce: TexasAppellateBriefDictFactory( + date=ce["date"], + type=ce["type"], + attachments=ce["attachments"], + ), + filter( + lambda e: True if random.random() < 0.1 else False, + d.case_events, + ), ) ) ) @@ -209,3 +232,28 @@ class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): CourtID.COURT_OF_CRIMINAL_APPEALS.value, ), ) + + @factory.post_generation + @staticmethod + def set_sc(obj, create, extracted, **kwargs): + if not create: + return + if obj["court_id"] == CourtID.SUPREME_COURT.value: + obj["case_events"] = map( + lambda ce: TexasSupremeCourtCaseEventDictFactory( + date=ce["date"], + type=ce["type"], + attachments=ce["attachments"], + disposition=ce["disposition"], + ), + obj["case_events"], + ) + obj["appellate_briefs"] = map( + lambda ab: TexasSupremeCourtAppellateBriefDictFactory( + date=ab["date"], + type=ab["type"], + attachments=ab["attachments"], + description=ab["description"], + ), + obj["appellate_briefs"], + ) From 76702edfa056ebd327de86733a7d0f39415621c6 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 4 Mar 2026 12:03:49 -0700 Subject: [PATCH 51/87] test(texas): Add test for generate_texas_appellate_brief_flags --- cl/corpus_importer/tests.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index fe7c214ae4..920ff643c7 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -85,6 +85,7 @@ classify_case_name_by_llm, download_texas_document_pdf, generate_ia_json, + generate_texas_appellate_brief_flags, get_and_save_free_document_report, merge_texas_case_transfers, merge_texas_docket, @@ -171,6 +172,7 @@ 
RECAPDocument, ) from cl.search.state.texas.factories import ( + TexasAppellateBriefDictFactory, TexasAppellateCourtInfoDictFactory, TexasAppellateTransferDictFactory, TexasCaseDocumentDictFactory, @@ -186,6 +188,7 @@ from cl.settings import MEDIA_ROOT from cl.tests.cases import TestCase from cl.tests.fakes import FakeCaseQueryReport, FakeFreeOpinionReport +from cl.tests.providers import fake from cl.tests.utils import MockResponse from cl.users.factories import UserProfileWithParentsFactory @@ -2180,6 +2183,36 @@ def tearDown(self): self.extract_pdf_document_patch.stop() self.download_pdf_patch.stop() + def test_generate_appellate_brief_flags(self): + n_events = fake.random_int(min=0, max=30) + case_events = [TexasCaseEventDictFactory() for _ in range(n_events)] + + appellate_brief_indices = sorted( + fake.random_elements(range(len(case_events)), unique=True) + ) + + appellate_briefs = [ + TexasAppellateBriefDictFactory( + date=case_events[i]["date"], + type=case_events[i]["type"], + attachments=case_events[i]["attachments"], + ) + for i in appellate_brief_indices + ] + + appellate_brief_flags = generate_texas_appellate_brief_flags( + case_events, appellate_briefs + ) + + actual_flags = [ + True if i in appellate_brief_indices else False + for i in range(len(case_events)) + ] + + assert appellate_brief_flags == actual_flags, ( + f"Incorrect appellate brief flags ({appellate_brief_flags}!={actual_flags}).\nCase events: {case_events}\nAppellate briefs: {appellate_briefs}" + ) + def test_merge_texas_document_new_document(self): """Can we correctly add a new attachment to an existing docket entry?""" docket_entry = self.docket_coa1_entry From 05d5fb07049b87774b919f918274e87814711936 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:00:59 -0700 Subject: [PATCH 52/87] fix(texas): Incorrect logs and appellate brief flag generation --- cl/corpus_importer/tasks.py | 17 ++++++++--------- 1 file changed, 
8 insertions(+), 9 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 4375d539c9..85f5529a01 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4080,7 +4080,12 @@ def generate_texas_appellate_brief_flags( next_brief = next(brief_iter, None) flags = [] for case_event in case_events: - if next_brief is not None and case_event == next_brief: + if ( + next_brief is not None + and case_event["date"] == next_brief["date"] + and case_event["type"] == next_brief["type"] + and case_event["attachments"] == next_brief["attachments"] + ): flags.append(True) next_brief = next(brief_iter, None) else: @@ -4185,12 +4190,6 @@ def merge_texas_docket( docket.appeal_from = lower_court lower_court_name = lower_court.full_name if not lower_court_name: - logger.warning( - "Failed to find court ID %s while populating appeal_from field for Texas docket %s in court %s", - lower_court_id, - docket.pk, - lower_court.pk, - ) # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts lower_court_name = lower_court_data.get("name", "") docket.appeal_from_str = lower_court_name @@ -4202,7 +4201,7 @@ def merge_texas_docket( logger.error( "Failed to merge party data for Texas docket %s in court %s", docket.docket_number, - lower_court.pk, + court.pk, ) entry_merge_results = [ @@ -4225,7 +4224,7 @@ def merge_texas_docket( logger.error( "Failed to merge CaseTransfer data for Texas docket %s in court %s", docket.docket_number, - lower_court.pk, + court.pk, ) create = ( From eb6d9f94c5bc03dec402ea1c91e58a6a8a581571 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:04:51 -0700 Subject: [PATCH 53/87] fix(texas): Rename lower_court to court --- cl/corpus_importer/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 
85f5529a01..c6ea355d54 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4103,7 +4103,7 @@ def merge_texas_docket( :param docket_data: The scraped Texas docket data. :return: The result of the merge operation.""" - lower_court = Court.objects.get( + court = Court.objects.get( pk=texas_js_court_id_to_court_id(docket_data["court_id"]) ) docket_number = docket_data["docket_number"] @@ -4133,10 +4133,10 @@ def merge_texas_docket( logger.info( "Disaggregating Texas appellate docket %s", docket_number ) - docket.court = lower_court + docket.court = court if docket is None: docket = async_to_sync(find_docket_object)( - court_id=lower_court.pk, + court_id=court.pk, pacer_case_id=None, docket_number=docket_number, federal_defendant_number=None, @@ -4159,7 +4159,7 @@ def merge_texas_docket( logger.error( "Failed to update originating court information for Texas docket %s in court %s", docket.docket_number, - lower_court.pk, + court.pk, ) if ( From f0a7c84c9371b4b417f8d1af46446d9ac4f511a4 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 4 Mar 2026 14:22:39 -0700 Subject: [PATCH 54/87] feat(texas): Make OCI match appeal_from --- cl/corpus_importer/tasks.py | 39 ++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index c6ea355d54..ecf9866ad5 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3830,27 +3830,34 @@ def merge_texas_docket_originating_court( docket.originating_court_information = OriginatingCourtInformation() originating_court_information = docket.originating_court_information - originating_court_data = docket_data["originating_court"] - oc_docket_number = originating_court_data["case"] - originating_court_information.docket_number = oc_docket_number - originating_court_information.docket_number_raw = oc_docket_number - 
originating_court_information.court_reporter = originating_court_data[ - "reporter" - ] - originating_court_information.assigned_to_str = originating_court_data[ - "judge" - ] - originating_court_id = texas_originating_court_to_court_id( - originating_court_data - ) + if ( + docket_data["court_type"] == CourtType.APPELLATE.value + or docket_data["appeals_court"]["court_id"] == CourtID.UNKNOWN.value + ): + ocd = docket_data["originating_court"] + oc_dn = ocd["case"] + oc_reporter = ocd["reporter"] + oc_judge = ocd["judge"] + oc_id = texas_originating_court_to_court_id(ocd) + else: + ocd = docket_data["appeals_court"] + oc_dn = ocd["case_number"] + oc_reporter = "" + oc_judge = ocd["justice"] + oc_id = texas_js_court_id_to_court_id(ocd["court_id"]) + + originating_court_information.docket_number = oc_dn + originating_court_information.docket_number_raw = oc_dn + originating_court_information.court_reporter = oc_reporter + originating_court_information.assigned_to_str = oc_judge # Only update judge if we're able to associate them with a court. - if originating_court_id: + if oc_id: async_to_sync(lookup_judge_by_full_name_and_set_attr)( item=originating_court_information, target_field="assigned_to", - full_name=originating_court_data["judge"], - court_id=originating_court_id, + full_name=oc_judge, + court_id=oc_id, event_date=None, require_living_judge=False, ) From 8c3ce8d88e30965c2b58a8a2fda3bd65336eb67c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 5 Mar 2026 15:05:20 -0700 Subject: [PATCH 55/87] fix(texas): Missing case_name and case_name_full in Texas merger Also add code to check everything's set. 
--- cl/corpus_importer/tasks.py | 7 ++++- cl/corpus_importer/tests.py | 58 ++++++++++++++++++++++++++++++++----- 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index ecf9866ad5..1c94c10c8e 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3786,7 +3786,9 @@ def normalize_texas_parties( "contact": attorney, "roles": ["LEAD_ATTORNEY"] if i == 0 else ["UNKNOWN"], } - for i, attorney in enumerate(party["representatives"]) + for i, attorney in enumerate( + [rep for rep in party["representatives"] if len(rep) > 0] + ) ], } for party in parties @@ -4159,6 +4161,9 @@ def merge_texas_docket( ) docket.date_filed = docket_data["date_filed"] docket.cause = docket_data["case_type"] + docket.case_name = docket_data["case_name"] + docket.case_name_full = docket_data["case_name_full"] + docket.docket_number_raw = docket_number originating_court_merge_result = merge_texas_docket_originating_court( docket, docket_data ) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 920ff643c7..3f6e9c1325 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -82,6 +82,7 @@ update_latest_case_id_and_schedule_iquery_sweep, ) from cl.corpus_importer.tasks import ( + MergeResult, classify_case_name_by_llm, download_texas_document_pdf, generate_ia_json, @@ -2219,10 +2220,8 @@ def test_merge_texas_document_new_document(self): input_document = TexasCaseDocumentDictFactory() - # Run the function result = merge_texas_document(docket_entry, input_document) - # Assertions assert result.create is True assert result.success is True assert result.pk is not None @@ -2258,10 +2257,8 @@ def test_merge_texas_document_existing_document_no_update(self): current_document.filepath_local = "a" current_document.save() - # Run the function result = merge_texas_document(docket_entry, input_document) - # Assertions assert result.create is False assert result.success is 
True assert result.pk == current_document.pk @@ -2286,7 +2283,6 @@ def test_merge_texas_document_existing_document_update(self): media_id=input_document["media_id"], ) - # Create an attachment current_document = TexasDocumentFactory.create( docket_entry=docket_entry, description=old_document["description"], @@ -2295,10 +2291,8 @@ def test_merge_texas_document_existing_document_update(self): url=old_document["document_url"], ) - # Run the function result = merge_texas_document(docket_entry, input_document) - # Assertions assert result.create is False assert result.update is True assert result.success is True @@ -3166,6 +3160,56 @@ def test_merge_texas_docket_final_court_sets_appeal_from(self): assert docket_sc.appeal_from_id == "txctapp1" assert docket_sc.appeal_from_str == self.texas_coa1.full_name + @patch( + "cl.corpus_importer.tasks.merge_texas_case_transfers", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_docket_entry", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_parties", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_docket_originating_court", + return_value=MergeResult.created(1), + ) + def test_merge_texas_docket_populates_all_fields( + self, mock_oci, mock_parties, mock_entry, mock_transfers + ): + """Does merge_texas_docket populate all Docket fields from input data?""" + texas_district = CourtFactory.create(id="texdistct6") + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + docket_data = TexasCourtOfAppealsDocketDictFactory( + court_id=CourtID.FIRST_COURT_OF_APPEALS.value, + originating_court=originating_court, + transfer_from=None, + ) + + result = merge_texas_docket(docket_data) + + assert result.success is True + assert result.pk is not None + + docket = Docket.objects.get(pk=result.pk) + assert docket.source & Docket.SCRAPER + assert docket.court_id == "txctapp1" + assert 
docket.docket_number == docket_data["docket_number"] + assert docket.docket_number_core == make_texas_docket_number_core( + docket_data["docket_number"] + ) + assert docket.docket_number_raw == docket_data["docket_number"] + assert docket.case_name == docket_data["case_name"] + assert docket.case_name_full == docket_data["case_name_full"] + assert docket.date_filed == docket_data["date_filed"] + assert docket.cause == docket_data["case_type"] + assert docket.appeal_from_id == "texdistct6" + assert docket.appeal_from_str == texas_district.full_name + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( From 81396e789d21077e6354cc06de099c6da8bda0d5 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 5 Mar 2026 15:30:38 -0700 Subject: [PATCH 56/87] test(texas): Add test for empty attorney name from JS --- cl/corpus_importer/tests.py | 57 +++++++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 3f6e9c1325..f95bd8b789 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -178,12 +178,15 @@ TexasAppellateTransferDictFactory, TexasCaseDocumentDictFactory, TexasCaseEventDictFactory, + TexasCasePartyDictFactory, TexasCourtOfAppealsDocketDictFactory, TexasDocketEntryFactory, TexasDocumentFactory, TexasFinalCourtDocketDictFactory, TexasOriginatingCourtDictFactory, TexasOriginatingDistrictCourtDictFactory, + TexasSupremeCourtAppellateBriefDictFactory, + TexasSupremeCourtCaseEventDictFactory, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument from cl.settings import MEDIA_ROOT @@ -2184,6 +2187,36 @@ def tearDown(self): self.extract_pdf_document_patch.stop() self.download_pdf_patch.stop() + def get_random_docket_entry_dict(self, **kwargs): + return fake.random_element( + ( + TexasSupremeCourtAppellateBriefDictFactory(**kwargs), + 
TexasSupremeCourtCaseEventDictFactory(**kwargs), + TexasAppellateBriefDictFactory(**kwargs), + TexasCaseEventDictFactory(**kwargs), + ) + ) + + def test_normalize_texas_parties_empty_atty_name(self): + party_0 = TexasCasePartyDictFactory(representatives=[""]) + # Filter out empty representatives + parties = [party_0] + + normalized = normalize_texas_parties(parties) + + self.assertEqual( + normalized, + [ + { + "name": party_0["name"], + "type": party_0["type"], + "date_terminated": None, + "extra_info": "", + "attorneys": [], + } + ], + ) + def test_generate_appellate_brief_flags(self): n_events = fake.random_int(min=0, max=30) case_events = [TexasCaseEventDictFactory() for _ in range(n_events)] @@ -2353,7 +2386,7 @@ def get_test_pdf( def test_merge_texas_docket_entry_new_entry(self): """Can we correctly handle a docket entry?""" - docket_entry = TexasCaseEventDictFactory( + docket_entry = self.get_random_docket_entry_dict( attachments=[TexasCaseDocumentDictFactory()], date=date.fromisoformat("2025-01-02"), type="Brief", @@ -2370,7 +2403,13 @@ def test_merge_texas_docket_entry_new_entry(self): created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == docket_entry["type"] - assert created_docket_entry.disposition == docket_entry["disposition"] + assert created_docket_entry.disposition == docket_entry.get( + "disposition", "" + ) + assert created_docket_entry.description == docket_entry.get( + "description", "" + ) + assert created_docket_entry.remarks == docket_entry.get("remarks", "") assert created_docket_entry.date_filed == docket_entry["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id @@ -2380,7 +2419,7 @@ def test_merge_texas_docket_entry_new_entry(self): def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" - js_docket_entry = 
TexasCaseEventDictFactory() + js_docket_entry = self.get_random_docket_entry_dict() result = merge_texas_docket_entry( self.docket_coa1, "2025-01-02.000", True, js_docket_entry @@ -2418,7 +2457,7 @@ def test_merge_texas_docket_entry_no_update(self): def test_merge_texas_docket_entry_add_document(self): """Can we correctly add a new document to an existing docket entry?""" - js_docket_entry = TexasCaseEventDictFactory() + js_docket_entry = self.get_random_docket_entry_dict() initial_n_attachments = len(js_docket_entry["attachments"]) result = merge_texas_docket_entry( @@ -2445,8 +2484,14 @@ def test_merge_texas_docket_entry_add_document(self): created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] - assert ( - created_docket_entry.disposition == js_docket_entry["disposition"] + assert created_docket_entry.remarks == js_docket_entry.get( + "remarks", "" + ) + assert created_docket_entry.description == js_docket_entry.get( + "description", "" + ) + assert created_docket_entry.disposition == js_docket_entry.get( + "disposition", "" ) assert created_docket_entry.date_filed == js_docket_entry["date"] n_attachments = TexasDocument.objects.filter( From 05c9b09b7ad5d41b5daea308ccb10e5b3cdbc429 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 5 Mar 2026 15:35:21 -0700 Subject: [PATCH 57/87] fix(texas): Skip merging OCI for unknown lower court types --- cl/corpus_importer/tasks.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 1c94c10c8e..38b825364e 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -3826,6 +3826,15 @@ def merge_texas_docket_originating_court( :param docket: The docket to add the originating court to. :param docket_data: The docket data from Juriscraper. 
:return: The result of the merge operation.""" + if ( + docket_data["originating_court"]["court_type"] + == CourtType.UNKNOWN.value + ): + logger.warning( + "Skipping merge of OCI for Texas docket %s due to unknown originating court type.", + docket_data["docket_number"], + ) + return MergeResult(create=False, update=False, success=False, pk=None) created = False if not docket.originating_court_information: created = True From a73375b22bcf2fb5368e67406ab612f4082b4bbb Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Mar 2026 17:02:35 -0600 Subject: [PATCH 58/87] feat(texas): Create merger for TrialCourtData model. --- cl/corpus_importer/tasks.py | 106 ++++++++++++++++++++++++++++++++++-- cl/corpus_importer/tests.py | 41 ++++++++++++++ cl/search/factories.py | 18 ++++++ 3 files changed, 161 insertions(+), 4 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 826ac9a1cb..8d0bfd516f 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -158,6 +158,7 @@ from cl.lib.storage import AWSMediaStorage from cl.lib.types import TaskData from cl.people_db.lookup_utils import ( + lookup_judge_by_full_name, lookup_judge_by_full_name_and_set_attr, ) from cl.people_db.models import Attorney, Role @@ -196,6 +197,7 @@ RECAPDocument, ScotusDocketMetadata, Tag, + TrialCourtData, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument @@ -3499,11 +3501,99 @@ def failed[S]() -> MergeResult[S]: ) @staticmethod - def unnecessary[S](pk: S) -> MergeResult[S]: - """Shorthand for the result of a unnecessary merge operation. + def unnecessary[S](pk: S | None) -> MergeResult[S]: + """Shorthand for the result of an unnecessary merge operation. 
:return: The constructed MergeResult object.""" - return MergeResult(create=False, update=False, success=True, pk=pk) + return MergeResult[S](create=False, update=False, success=True, pk=pk) + + +def merge_texas_trial_court_data( + docket: Docket, + docket_data: TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, +) -> MergeResult: + """ + Create or update a TrialCourtData object to capture trial court information + for Texas SC and CCA cases. + + :param docket: The docket in the SC or CCA. + :param docket_data: The scraped docket data. + + :return: The result of the attempted merge operation. + """ + originating_court = docket_data["originating_court"] + if originating_court["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Originating court for Texas docket %s is appellate. TrialCourtData unnecessary.", + docket.docket_number, + ) + return MergeResult.unnecessary(None) + dn_trial = originating_court["case"] + judge_name = originating_court["judge"] + reporter = originating_court["reporter"] + punishment = originating_court["punishment"] + county = originating_court["county"] + court_id = texas_originating_court_to_court_id(originating_court) + if court_id: + court = Court.objects.get(pk=court_id) + court_name = court.full_name + + if judge_name: + judge = async_to_sync(lookup_judge_by_full_name)( + name=judge_name, + court_id=court_id, + event_date=None, + require_living_judge=False, + ) + else: + judge = None + else: + court = None + court_name = originating_court["name"] + judge = None + + try: + trial_court_data = TrialCourtData.objects.get( + docket=docket, + ) + except TrialCourtData.DoesNotExist: + logger.info( + "No existing TrialCourtData object found for Texas docket %s. 
Creating...", + docket.docket_number, + ) + created = True + trial_court_data = TrialCourtData( + docket=docket, + ) + else: + created = False + + new_values = { + "docket_number_trial": dn_trial, + "docket_number_raw_trial": dn_trial, + "judge_str": judge_name, + "judge": judge, + "reporter": reporter, + "court_name": court_name, + "court": court, + "punishment": punishment, + "county": county, + } + + updated = False + if not created: + updated = any( + getattr(trial_court_data, k) != v for k, v in new_values.items() + ) + if not updated: + return MergeResult.unnecessary(trial_court_data.pk) + + for k, v in new_values.items(): + setattr(trial_court_data, k, v) + trial_court_data.save() + return MergeResult( + create=created, update=updated, success=True, pk=trial_court_data.pk + ) def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: @@ -4155,7 +4245,7 @@ def merge_texas_docket( ) docket.court = court if docket is None: - docket = async_to_sync(find_docket_object)( + docket: Docket = async_to_sync(find_docket_object)( court_id=court.pk, pacer_case_id=None, docket_number=docket_number, @@ -4219,6 +4309,11 @@ def merge_texas_docket( docket.save() + if docket_data["court_type"] == CourtType.SUPREME.value: + trial_court_result = merge_texas_trial_court_data(docket, docket_data) + else: + trial_court_result = MergeResult.unnecessary(None) + party_merge_result = merge_texas_parties(docket, docket_data["parties"]) if not party_merge_result.success: logger.error( @@ -4252,18 +4347,21 @@ def merge_texas_docket( create = ( party_merge_result.create + or trial_court_result.create or originating_court_merge_result.create or merge_case_transfer_result.create or any(r.create for r in entry_merge_results) ) update = ( party_merge_result.update + or trial_court_result.update or originating_court_merge_result.update or merge_case_transfer_result.update or any(r.update for r in entry_merge_results) ) success = ( party_merge_result.success + and 
trial_court_result.success and originating_court_merge_result.success and merge_case_transfer_result.success and all(r.success for r in entry_merge_results) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index f95bd8b789..adbd80237d 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -94,6 +94,7 @@ merge_texas_docket_originating_court, merge_texas_document, merge_texas_parties, + merge_texas_trial_court_data, normalize_texas_parties, probe_or_scrape_iquery_pages, ) @@ -171,6 +172,7 @@ OpinionCluster, OriginatingCourtInformation, RECAPDocument, + TrialCourtData, ) from cl.search.state.texas.factories import ( TexasAppellateBriefDictFactory, @@ -3255,6 +3257,45 @@ def test_merge_texas_docket_populates_all_fields( assert docket.appeal_from_id == "texdistct6" assert docket.appeal_from_str == texas_district.full_name + def test_merge_trial_court_data(self): + """Can we create and then update TrialCourtData?""" + # Test written with the help of Claude Code + texas_district = CourtFactory.create(id="texdistct6") + originating_court = TexasOriginatingDistrictCourtDictFactory( + district=5, + ) + docket_data = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + originating_court=originating_court, + ) + + docket_sc = DocketFactory.create(court=self.texas_sc) + result = merge_texas_trial_court_data(docket_sc, docket_data) + + assert result.create is True + assert result.success is True + assert result.pk is not None + + tcd = TrialCourtData.objects.get(pk=result.pk) + assert tcd.docket_id == docket_sc.pk + assert tcd.docket_number_raw_trial == originating_court["case"] + assert tcd.docket_number_trial == originating_court["case"] + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == originating_court["punishment"] + assert tcd.county == originating_court["county"] + assert tcd.court == texas_district + assert 
tcd.court_name == texas_district.full_name + + # Merging the same data again should update, not create + result2 = merge_texas_trial_court_data(docket_sc, docket_data) + + assert result2.create is False + assert result2.update is True + assert result2.success is True + assert result2.pk == tcd.pk + assert TrialCourtData.objects.filter(docket=docket_sc).count() == 1 + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( diff --git a/cl/search/factories.py b/cl/search/factories.py index 41c759a8e3..87aa1ae288 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -37,6 +37,7 @@ Parenthetical, ParentheticalGroup, RECAPDocument, + TrialCourtData, ) from cl.tests.providers import LegalProvider @@ -457,3 +458,20 @@ class CaseTransferFactory(DjangoModelFactory): class Meta: model = CaseTransfer + + +class TrialCourtDataFactory(DjangoModelFactory): + docket = SubFactory(DocketFactory) + docket_number_trial = Faker("federal_district_docket_number") + docket_number_raw_trial = SelfAttribute("docket_number_trial") + judge_str = Faker("name") + judge = SubFactory(PersonFactory) + reporter = Faker("name") + date_filed = Faker("date_object") + court_name = Faker("court_name") + court = SubFactory(CourtFactory) + punishment = Faker("pystr") + county = Faker("pystr") + + class Meta: + model = TrialCourtData From 4a3794b1a98f1eadc9f530c1b3302d82df38dd67 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 23:16:56 +0000 Subject: [PATCH 59/87] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cl/corpus_importer/tasks.py | 1 - cl/corpus_importer/tests.py | 1 - 2 files changed, 2 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 4f9b2e39aa..bfea9eb4c4 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -186,7 +186,6 @@ from cl.search.cluster_sources 
import ClusterSources from cl.search.models import ( PRECEDENTIAL_STATUS, - SOURCES, CaseTransfer, ClaimHistory, Court, diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index c911e8f1f1..18998b55ea 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -165,7 +165,6 @@ ) from cl.search.models import ( SEARCH_TYPES, - SOURCES, CaseTransfer, Citation, Docket, From c47f1d63b9d82dab2c2e9a538dd1378121538bbf Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 9 Mar 2026 19:18:40 -0600 Subject: [PATCH 60/87] fix(texas): Flaky tests Appellate brief flag test was breaking when there were no case events; trial court data test was incomplete and needed a Texas trial court in the DB to work. --- cl/corpus_importer/tasks.py | 2 +- cl/corpus_importer/tests.py | 53 +++++++++++++++++++++++++++++++------ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index bfea9eb4c4..83a06f8765 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4245,7 +4245,7 @@ def merge_texas_docket( ) docket.court = court if docket is None: - docket: Docket = async_to_sync(find_docket_object)( + docket = async_to_sync(find_docket_object)( court_id=court.pk, pacer_case_id=None, docket_number=docket_number, diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 18998b55ea..3c307b8f70 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2171,6 +2171,7 @@ def setUpTestData(cls): cls.texas_sc = CourtFactory.create(id="tex") cls.texas_cca = CourtFactory.create(id="texcrimapp") cls.texas_coa1 = CourtFactory.create(id="txctapp1") + cls.texas_dc100 = CourtFactory.create(id="texdistct101") cls.docket_number_coa1 = "01-25-00011-CV" cls.docket_coa1 = DocketFactory.create( court=cls.texas_coa1, @@ -2223,9 +2224,12 @@ def test_generate_appellate_brief_flags(self): n_events = 
fake.random_int(min=0, max=30) case_events = [TexasCaseEventDictFactory() for _ in range(n_events)] - appellate_brief_indices = sorted( - fake.random_elements(range(len(case_events)), unique=True) - ) + if len(case_events) == 0: + appellate_brief_indices = [] + else: + appellate_brief_indices = sorted( + fake.random_elements(range(len(case_events)), unique=True) + ) appellate_briefs = [ TexasAppellateBriefDictFactory( @@ -2447,8 +2451,14 @@ def test_merge_texas_docket_entry_no_update(self): created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id assert created_docket_entry.entry_type == js_docket_entry["type"] - assert ( - created_docket_entry.disposition == js_docket_entry["disposition"] + assert created_docket_entry.disposition == js_docket_entry.get( + "disposition", "" + ) + assert created_docket_entry.description == js_docket_entry.get( + "description", "" + ) + assert created_docket_entry.remarks == js_docket_entry.get( + "remarks", "" ) assert created_docket_entry.date_filed == js_docket_entry["date"] n_attachments = TexasDocument.objects.filter( @@ -3194,6 +3204,9 @@ def test_merge_texas_docket_final_court_sets_appeal_from(self): court_id=CourtID.SUPREME_COURT.value, docket_number=docket_sc.docket_number, appeals_court=appeals_court, + originating_court=TexasOriginatingDistrictCourtDictFactory( + district=100 + ), ) result = merge_texas_docket(docket_data) @@ -3287,15 +3300,39 @@ def test_merge_trial_court_data(self): assert tcd.court == texas_district assert tcd.court_name == texas_district.full_name - # Merging the same data again should update, not create + # Merging the same data again should be unnecessary result2 = merge_texas_trial_court_data(docket_sc, docket_data) - + tcd.refresh_from_db() assert result2.create is False - assert result2.update is True + assert result2.update is False assert result2.success is True + assert tcd.docket_number_raw_trial == originating_court["case"] 
+ assert tcd.docket_number_trial == originating_court["case"] + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == originating_court["punishment"] + assert tcd.county == originating_court["county"] assert result2.pk == tcd.pk assert TrialCourtData.objects.filter(docket=docket_sc).count() == 1 + # Merging changed data should update + new_dn = originating_court["case"] + "Different" + originating_court["case"] = new_dn + + result3 = merge_texas_trial_court_data(docket_sc, docket_data) + tcd.refresh_from_db() + assert result3.create is False + assert result3.update is True + assert result3.success is True + assert tcd.docket_number_raw_trial == originating_court["case"] + assert tcd.docket_number_trial == new_dn + assert tcd.judge_str == originating_court["judge"] + assert tcd.reporter == originating_court["reporter"] + assert tcd.punishment == originating_court["punishment"] + assert tcd.county == originating_court["county"] + assert result3.pk == tcd.pk + assert TrialCourtData.objects.filter(docket=docket_sc).count() == 1 + @patch("cl.corpus_importer.tasks.get_or_cache_pacer_cookies") @override_settings( From 0e25c55b9b6543b9002fa46c9bb02aa64fbd6dd0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Mar 2026 20:46:51 +0000 Subject: [PATCH 61/87] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- cl/search/factories.py | 3 ++- cl/search/models.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cl/search/factories.py b/cl/search/factories.py index c1a4b0176d..0cc115467b 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -37,8 +37,8 @@ Parenthetical, ParentheticalGroup, RECAPDocument, - TrialCourtData, SCOTUSDocketEntry, + TrialCourtData, ) from cl.tests.providers import LegalProvider @@ -512,5 +512,6 @@ class 
TrialCourtDataFactory(DjangoModelFactory): class Meta: model = TrialCourtData + docket_entries = List([SubFactory(SCOTUSDocketEntryDataFactory)]) parties = List([SubFactory(SCOTUSPartyDataFactory)]) diff --git a/cl/search/models.py b/cl/search/models.py index 3c6936c7be..89e2b4eeb9 100644 --- a/cl/search/models.py +++ b/cl/search/models.py @@ -4196,7 +4196,7 @@ class Meta: "attachment_number", ) ordering = ("document_number", "attachment_number") - + def __str__(self) -> str: return f"{self.pk}: Docket_{self.docket_entry.docket.docket_number} , document_number_{self.document_number} , attachment_number_{self.attachment_number}" @@ -4216,6 +4216,7 @@ def file_name(self) -> str: return extract_file_name_from_url(self.url) + @pghistory.track() @document_model class TrialCourtData(AbstractDateTimeModel): @@ -4274,4 +4275,3 @@ class TrialCourtData(AbstractDateTimeModel): ) punishment = models.TextField(blank=True) county = models.TextField(blank=True) - From 47ea1f0a4c14a1cb1cbb61127ac0f04cbf9a4e7c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:33:08 -0600 Subject: [PATCH 62/87] feat(texas): Separate docket merging and attachment downloading by default Per #7096, not doing so can lead to the celery queue being overwhelmed. 
--- .../commands/download_texas_documents.py | 104 ++++++++++++++++++ cl/corpus_importer/management/utils.py | 18 ++- cl/corpus_importer/tasks.py | 50 +++++++-- 3 files changed, 157 insertions(+), 15 deletions(-) create mode 100644 cl/corpus_importer/management/commands/download_texas_documents.py diff --git a/cl/corpus_importer/management/commands/download_texas_documents.py b/cl/corpus_importer/management/commands/download_texas_documents.py new file mode 100644 index 0000000000..d5fa6be8b8 --- /dev/null +++ b/cl/corpus_importer/management/commands/download_texas_documents.py @@ -0,0 +1,104 @@ +import time + +from celery import chain +from django.db.models import Q + +from cl.corpus_importer.tasks import download_texas_document_pdf, logger +from cl.lib.celery_utils import CeleryThrottle +from cl.lib.command_utils import VerboseCommand +from cl.scrapers.tasks import extract_pdf_document +from cl.search.models import TexasDocument + + +def download_and_extract_texas_documents( + download_queue: str, extraction_queue: str, delay: float +) -> None: + """ + Download and extract attachments for TexasDocument with a missing or stale + local file. + + Queries TexasDocument instances that have no filepath_local or have + `ocr_status` not unnecessary or complete, then schedules a download -> + extraction chain for each. + + :param download_queue: The celery queue for download tasks. + :param extraction_queue: The celery queue for extraction tasks. + :param delay: Seconds to sleep between scheduling tasks. 
+ :return: None + """ + docs = ( + TexasDocument.objects.filter(Q(filepath_local="")) + .exclude( + ocr_status__notin=( + TexasDocument.OCR_UNNECESSARY, + TexasDocument.OCR_COMPLETE, + ) + ) + .values_list("pk", flat=True) + .order_by() + ) + count = docs.count() + logger.info( + "Found %s TexasDocuments needing download or extraction.", count + ) + throttle = CeleryThrottle(queue_name=extraction_queue) + processed_count = 0 + for pk in docs.iterator(): + throttle.maybe_wait() + chain( + download_texas_document_pdf.si(pk).set(queue=download_queue), + extract_pdf_document.s( + check_if_needed=False, + model_name="search.TexasDocument", + ).set(queue=extraction_queue), + ).apply_async() + processed_count += 1 + if processed_count % 100 == 0: + logger.info( + "Scheduled %s/%s (%s)", + processed_count, + count, + f"{processed_count / count:.0%}", + ) + time.sleep(delay) + logger.info( + "Scheduled %s/%s", + processed_count, + count, + ) + + +class Command(VerboseCommand): + help = "Download and extract PDFs for TexasDocument instances which have missing or stale local files." 
+ + def add_arguments(self, parser): + parser.add_argument( + "--download-queue", + type=str, + default="celery", + help="The celery queue for PDF download tasks.", + ) + parser.add_argument( + "--extraction-queue", + type=str, + default="celery", + help="The celery queue for PDF extraction tasks.", + ) + parser.add_argument( + "--delay", + type=float, + default=1.0, + help="Seconds to sleep between scheduling tasks.", + ) + + def handle(self, *args, **options): + super().handle(*args, **options) + + extraction_queue = options["extraction_queue"] + delay = options["delay"] + download_queue = options["download_queue"] + + logger.info("Downloading and extracting TexasDocument attachments...") + download_and_extract_texas_documents( + download_queue, extraction_queue, delay + ) diff --git a/cl/corpus_importer/management/utils.py b/cl/corpus_importer/management/utils.py index 8563657ef8..932b08dde0 100644 --- a/cl/corpus_importer/management/utils.py +++ b/cl/corpus_importer/management/utils.py @@ -48,9 +48,12 @@ class CorpusImporterCommand(VerboseCommand, ABC): Required methods are: - - `merge_task`: Should return a Celery task which takes the output of + - `merge_task`: Should return a Celery task which takes the output of\ `download_task`, parses it, and merges it into the database. Input\ - should be whatever the output of `download_task` is. + should be whatever the output of `download_task` is. Must accept a\ + `download_attachments` boolean keyword argument indicating whether\ + docket entry attachments should be downloaded as part of the merging\ + process. 
Required properties are: @@ -116,6 +119,12 @@ def add_arguments(self, parser): default=False, help="Randomly select rows from the inventory file to import.", ) + parser.add_argument( + "--download-attachments", + type=bool, + default=False, + help="Whether to download docket entry attachments as part of this command.", + ) @staticmethod def download_task() -> app.Task: @@ -152,6 +161,7 @@ def handle(self, *args, **options): delay = options["delay"] inventory_rows = options["inventory_rows"] inventory_path = settings.MEDIA_ROOT / options["inventory_file"] + download_attachments = options["download_attachments"] start_row = options["start_row"] if options["auto_resume"]: @@ -184,7 +194,9 @@ def handle(self, *args, **options): self.download_task() .si(*download_args) .set(queue=retrieval_queue), - self.merge_task().s().set(queue=ingesting_queue), + self.merge_task() + .s(download_attachments=download_attachments) + .set(queue=ingesting_queue), ).apply_async() time.sleep(delay) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 66f360599b..dfa9957090 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4142,7 +4142,9 @@ def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: def merge_texas_document( - docket_entry: TexasDocketEntry, input_document: TexasCaseDocument + docket_entry: TexasDocketEntry, + input_document: TexasCaseDocument, + download_attachments: bool = False, ) -> MergeResult: """Merge a single TexasCaseDocument object into CL. @@ -4153,7 +4155,10 @@ def merge_texas_document( :param docket_entry: The docket entry this attachment belongs to. :param input_document: The attachment to merge. - :return: The result of the merge operation.""" + :param download_attachments: Whether to download docket entry attachments. + + :return: The result of the merge operation. 
+ """ try: texas_document = TexasDocument.objects.get( media_id=input_document["media_id"], @@ -4185,13 +4190,18 @@ def merge_texas_document( texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.url = input_document["document_url"] + texas_document.filepath_local = "" + # Using this as a quick and dirty proxy for stale attachments, because + # I don't want to do a migration. + texas_document.ocr_status = None texas_document.save() - chain( - download_texas_document_pdf.si(texas_document.pk), - extract_pdf_document.s( - check_if_needed=False, model_name="search.TexasDocument" - ), - ).apply_async() + if download_attachments: + chain( + download_texas_document_pdf.si(texas_document.pk), + extract_pdf_document.s( + check_if_needed=False, model_name="search.TexasDocument" + ), + ).apply_async() return MergeResult( create=not existed, update=existed, @@ -4211,6 +4221,7 @@ def merge_texas_docket_entry( | TexasAppellateBrief | TexasSupremeCourtCaseEvent | TexasSupremeCourtAppellateBrief, + download_attachments: bool = False, ) -> MergeResult: """Merges a Texas docket entry into CL. @@ -4218,12 +4229,15 @@ def merge_texas_docket_entry( :param sequence_number: The sequence number of the docket entry. :param appellate_brief: Whether the docket entry is an appellate brief. :param input_docket_entry: The docket entry being merged. + :param download_attachments: Whether to download docket entry attachments. + :return: Tuple with the following entries - A flag indicating whether the docket entry or an attached document needed to be created or updated, - A flag which is set to true when the create/update operations are all either successful or unnecessary, - - The primary key of the updated TexasDocketEntry object.""" + - The primary key of the updated TexasDocketEntry object. 
+ """ logger.info( "Merging TexasDocketEntry with sequence number %s into Docket %s", sequence_number, @@ -4308,7 +4322,9 @@ def merge_texas_docket_entry( docket.pk, ) document_results = [ - merge_texas_document(docket_entry, document) + merge_texas_document( + docket_entry, document, download_attachments=download_attachments + ) for document in input_docket_entry["attachments"] ] @@ -4673,10 +4689,12 @@ def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, + download_attachments: bool = False, ) -> MergeResult: """Merges scraped data from a Texas docket into the `Docket` table. :param docket_data: The scraped Texas docket data. + :param download_attachments: Whether to download docket entry attachments. :return: The result of the merge operation.""" court = Court.objects.get( @@ -4790,7 +4808,11 @@ def merge_texas_docket( entry_merge_results = [ merge_texas_docket_entry( - docket, sequence_number, appellate_brief, entry + docket, + sequence_number, + appellate_brief, + entry, + download_attachments=download_attachments, ) for sequence_number, appellate_brief, entry in zip( create_docket_entry_sequence_numbers(docket_data["case_events"]), @@ -4856,6 +4878,7 @@ def merge_texas_docket( def texas_ingest_docket_task( task: Task, i: tuple[bytes, TexasDocketMeta], + download_attachments: bool = False, ) -> MergeResult: """ Task to parse and merge a Texas docket. @@ -4865,6 +4888,7 @@ def texas_ingest_docket_task( :param i: Tuple with the following entries: - Bytes string to parse. - Docket metadata. + :param download_attachments: Whether to download docket entry attachments. :return: The result of the merge operation. 
""" @@ -4897,7 +4921,9 @@ def texas_ingest_docket_task( ) task.request.chain = None return MergeResult.failed() - return merge_texas_docket(docket_data) + return merge_texas_docket( + docket_data, download_attachments=download_attachments + ) @app.task( From 120b60c6ead82578e7f1bcdb3cec2253c4764273 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 09:27:08 -0600 Subject: [PATCH 63/87] test(texas): Fix tests for download_attachments parameter and add opt-out test Existing tests that assert attachment downloads are triggered now pass download_attachments=True explicitly, matching the new default of False. Added a test to verify downloads are skipped when the flag is False. Co-Authored-By: Claude Opus 4.6 --- cl/corpus_importer/tests.py | 56 +++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 8 deletions(-) diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 3c307b8f70..14104d8827 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2259,7 +2259,9 @@ def test_merge_texas_document_new_document(self): input_document = TexasCaseDocumentDictFactory() - result = merge_texas_document(docket_entry, input_document) + result = merge_texas_document( + docket_entry, input_document, download_attachments=True + ) assert result.create is True assert result.success is True @@ -2330,7 +2332,9 @@ def test_merge_texas_document_existing_document_update(self): url=old_document["document_url"], ) - result = merge_texas_document(docket_entry, input_document) + result = merge_texas_document( + docket_entry, input_document, download_attachments=True + ) assert result.create is False assert result.update is True @@ -2349,6 +2353,20 @@ def test_merge_texas_document_existing_document_update(self): self.download_task_mock.assert_called_once_with(current_document.pk) + def test_merge_texas_document_skips_download_when_disabled(self): + """Are attachment 
downloads skipped when download_attachments=False?""" + docket_entry = self.docket_coa1_entry + input_document = TexasCaseDocumentDictFactory() + + result = merge_texas_document( + docket_entry, input_document, download_attachments=False + ) + + assert result.create is True + assert result.success is True + assert result.pk is not None + self.download_task_mock.assert_not_called() + @mock.patch("cl.lib.celery_utils.get_task_wait", return_value=0) @mock.patch("cl.corpus_importer.tasks.doc_page_count_service") @responses.activate @@ -2381,7 +2399,9 @@ def get_test_pdf( ) docket_entry = self.docket_coa1_entry - result = merge_texas_document(docket_entry, input_document) + result = merge_texas_document( + docket_entry, input_document, download_attachments=True + ) docket_entry.refresh_from_db() document = TexasDocument.objects.get(pk=result.pk) @@ -2399,7 +2419,11 @@ def test_merge_texas_docket_entry_new_entry(self): ) output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, docket_entry + self.docket_coa1, + "2025-01-02.000", + True, + docket_entry, + download_attachments=True, ) assert output.create is True @@ -2428,7 +2452,11 @@ def test_merge_texas_docket_entry_no_update(self): js_docket_entry = self.get_random_docket_entry_dict() result = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) @@ -2440,7 +2468,11 @@ def test_merge_texas_docket_entry_no_update(self): # noop output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, ) assert output.create is False @@ -2473,7 +2505,11 @@ def test_merge_texas_docket_entry_add_document(self): initial_n_attachments = len(js_docket_entry["attachments"]) result = 
merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) @@ -2485,7 +2521,11 @@ def test_merge_texas_docket_entry_add_document(self): js_docket_entry["attachments"].append(TexasCaseDocumentDictFactory()) output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, ) assert output.create is True From 877ab0e5dc1d571ecae2b9933c74bc6800452cd3 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 09:47:41 -0600 Subject: [PATCH 64/87] fix(texas): Fix queryset lookup and Celery race condition in merger - Fix `ocr_status__notin` (not a valid Django lookup) to `ocr_status__in` in download_texas_documents command's .exclude() queryset. - Use `transaction.on_commit` to defer Celery `apply_async` calls in `merge_texas_document`, preventing a race where the worker reads the TexasDocument row before the enclosing transaction commits. - Update tests to use `captureOnCommitCallbacks(execute=True)` where download side effects are expected. 
Co-Authored-By: Claude Opus 4.6 --- .../commands/download_texas_documents.py | 2 +- cl/corpus_importer/tasks.py | 15 +-- cl/corpus_importer/tests.py | 96 ++++++++++--------- 3 files changed, 62 insertions(+), 51 deletions(-) diff --git a/cl/corpus_importer/management/commands/download_texas_documents.py b/cl/corpus_importer/management/commands/download_texas_documents.py index d5fa6be8b8..c1c6efda34 100644 --- a/cl/corpus_importer/management/commands/download_texas_documents.py +++ b/cl/corpus_importer/management/commands/download_texas_documents.py @@ -29,7 +29,7 @@ def download_and_extract_texas_documents( docs = ( TexasDocument.objects.filter(Q(filepath_local="")) .exclude( - ocr_status__notin=( + ocr_status__in=( TexasDocument.OCR_UNNECESSARY, TexasDocument.OCR_COMPLETE, ) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index dfa9957090..da8b064f92 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4196,12 +4196,15 @@ def merge_texas_document( texas_document.ocr_status = None texas_document.save() if download_attachments: - chain( - download_texas_document_pdf.si(texas_document.pk), - extract_pdf_document.s( - check_if_needed=False, model_name="search.TexasDocument" - ), - ).apply_async() + transaction.on_commit( + lambda pk=texas_document.pk: chain( + download_texas_document_pdf.si(pk), + extract_pdf_document.s( + check_if_needed=False, + model_name="search.TexasDocument", + ), + ).apply_async() + ) return MergeResult( create=not existed, update=existed, diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 14104d8827..ecc64cce5a 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2259,9 +2259,10 @@ def test_merge_texas_document_new_document(self): input_document = TexasCaseDocumentDictFactory() - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + with self.captureOnCommitCallbacks(execute=True): + result = 
merge_texas_document( + docket_entry, input_document, download_attachments=True + ) assert result.create is True assert result.success is True @@ -2332,9 +2333,10 @@ def test_merge_texas_document_existing_document_update(self): url=old_document["document_url"], ) - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_document( + docket_entry, input_document, download_attachments=True + ) assert result.create is False assert result.update is True @@ -2399,9 +2401,10 @@ def get_test_pdf( ) docket_entry = self.docket_coa1_entry - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_document( + docket_entry, input_document, download_attachments=True + ) docket_entry.refresh_from_db() document = TexasDocument.objects.get(pk=result.pk) @@ -2418,13 +2421,14 @@ def test_merge_texas_docket_entry_new_entry(self): type="Brief", ) - output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - docket_entry, - download_attachments=True, - ) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, + "2025-01-02.000", + True, + docket_entry, + download_attachments=True, + ) assert output.create is True assert output.update is False @@ -2451,13 +2455,14 @@ def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" js_docket_entry = self.get_random_docket_entry_dict() - result = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, - ) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_docket_entry( + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, + ) pk = result.pk documents = 
TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: @@ -2467,13 +2472,14 @@ def test_merge_texas_docket_entry_no_update(self): self.extract_pdf_document_mock.reset_mock() # noop - output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, - ) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, + ) assert output.create is False assert output.update is True @@ -2504,13 +2510,14 @@ def test_merge_texas_docket_entry_add_document(self): js_docket_entry = self.get_random_docket_entry_dict() initial_n_attachments = len(js_docket_entry["attachments"]) - result = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, - ) + with self.captureOnCommitCallbacks(execute=True): + result = merge_texas_docket_entry( + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, + ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) for document in documents: @@ -2520,13 +2527,14 @@ def test_merge_texas_docket_entry_add_document(self): self.extract_pdf_document_mock.reset_mock() js_docket_entry["attachments"].append(TexasCaseDocumentDictFactory()) - output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, - ) + with self.captureOnCommitCallbacks(execute=True): + output = merge_texas_docket_entry( + self.docket_coa1, + "2025-01-02.000", + True, + js_docket_entry, + download_attachments=True, + ) assert output.create is True assert output.update is True From cfc3af730880b83d7ef73c545270f1f670c00608 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 10:51:40 -0600 Subject: [PATCH 65/87] fix(texas): 
mypy error and add comment explaining reason --- cl/corpus_importer/tasks.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index da8b064f92..a3e4d31398 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4197,13 +4197,18 @@ def merge_texas_document( texas_document.save() if download_attachments: transaction.on_commit( - lambda pk=texas_document.pk: chain( - download_texas_document_pdf.si(pk), - extract_pdf_document.s( - check_if_needed=False, - model_name="search.TexasDocument", - ), - ).apply_async() + # Lambda captures the pk without needing to keep the whole + # object around. It needs to be wrapped in another lambda to + # prevent mypy from complaining. + ( + lambda pk: lambda: chain( + download_texas_document_pdf.si(pk), + extract_pdf_document.s( + check_if_needed=False, + model_name="search.TexasDocument", + ), + ).apply_async() + )(texas_document.pk) ) return MergeResult( create=not existed, From 15e6f510b96e9f36e28418e8ace542bb13adb930 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:24:53 -0600 Subject: [PATCH 66/87] feat(texas): Add --only-extraction flag to document download command --- .../commands/download_texas_documents.py | 100 +++++++++++++++--- cl/search/factories.py | 7 +- 2 files changed, 86 insertions(+), 21 deletions(-) diff --git a/cl/corpus_importer/management/commands/download_texas_documents.py b/cl/corpus_importer/management/commands/download_texas_documents.py index c1c6efda34..9ce85102df 100644 --- a/cl/corpus_importer/management/commands/download_texas_documents.py +++ b/cl/corpus_importer/management/commands/download_texas_documents.py @@ -1,4 +1,5 @@ import time +from itertools import batched from celery import chain from django.db.models import Q @@ -10,6 +11,59 @@ from cl.search.models import TexasDocument +def extract_texas_documents( 
+ extraction_queue: str, batch_size: int, delay: float +) -> None: + """ + Run the extraction task for TexasDocument instances where ocr_status is not + OCR_UNNECESSARY or OCR_COMPLETE. + + :param extraction_queue: The celery queue for PDF extraction tasks. + :param batch_size: The batch size for PDF extraction tasks. + :param delay: Seconds to sleep between scheduling tasks. + + :return: None + """ + docs = ( + TexasDocument.objects.exclude( + Q(filepath_local="") + | Q( + ocr_status__in=( + TexasDocument.OCR_UNNECESSARY, + TexasDocument.OCR_COMPLETE, + ) + ) + ) + .values_list("pk", flat=True) + .order_by() + ) + count = docs.count() + logger.info("Found %s TexasDocuments needing extraction.", count) + throttle = CeleryThrottle(queue_name=extraction_queue) + processed_count = 0 + for pks in batched(docs.iterator(), batch_size): + throttle.maybe_wait() + extract_pdf_document.si( + pks=pks, + check_if_needed=False, + model_name="search.TexasDocument", + ).set(queue=extraction_queue).apply_async() + processed_count += 1 + if processed_count % 100 == 0: + logger.info( + "Scheduled %s/%s (%s)", + processed_count, + count, + f"{processed_count / count:.0%}", + ) + time.sleep(delay) + logger.info( + "Scheduled %s/%s", + processed_count, + count, + ) + + def download_and_extract_texas_documents( download_queue: str, extraction_queue: str, delay: float ) -> None: @@ -17,29 +71,23 @@ def download_and_extract_texas_documents( Download and extract attachments for TexasDocument with a missing or stale local file. - Queries TexasDocument instances that have no filepath_local or have - `ocr_status` not unnecessary or complete, then schedules a download -> - extraction chain for each. + Queries TexasDocument instances that have no filepath_local, then schedules + a download -> extraction chain for each. :param download_queue: The celery queue for download tasks. :param extraction_queue: The celery queue for extraction tasks. 
:param delay: Seconds to sleep between scheduling tasks. + :return: None """ docs = ( - TexasDocument.objects.filter(Q(filepath_local="")) - .exclude( - ocr_status__in=( - TexasDocument.OCR_UNNECESSARY, - TexasDocument.OCR_COMPLETE, - ) - ) + TexasDocument.objects.filter(filepath_local="") .values_list("pk", flat=True) .order_by() ) count = docs.count() logger.info( - "Found %s TexasDocuments needing download or extraction.", count + "Found %s TexasDocuments needing download and extraction.", count ) throttle = CeleryThrottle(queue_name=extraction_queue) processed_count = 0 @@ -84,6 +132,18 @@ def add_arguments(self, parser): default="celery", help="The celery queue for PDF extraction tasks.", ) + parser.add_argument( + "--only-extraction", + type=bool, + default=False, + help="Skip downloading attachments and only run the extraction task.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=1000, + help="The batch size for PDF extraction tasks. Only used if --only-extraction is true.", + ) parser.add_argument( "--delay", type=float, @@ -96,9 +156,17 @@ def handle(self, *args, **options): extraction_queue = options["extraction_queue"] delay = options["delay"] - download_queue = options["download_queue"] + only_extraction = options["only_extraction"] - logger.info("Downloading and extracting TexasDocument attachments...") - download_and_extract_texas_documents( - download_queue, extraction_queue, delay - ) + if only_extraction: + batch_size = options["batch_size"] + logger.info("Running extraction for TexasDocuments...") + extract_texas_documents(extraction_queue, batch_size, delay) + else: + download_queue = options["download_queue"] + logger.info( + "Downloading and extracting TexasDocument attachments..." 
+ ) + download_and_extract_texas_documents( + download_queue, extraction_queue, delay + ) diff --git a/cl/search/factories.py b/cl/search/factories.py index 0cc115467b..3cd640d7ee 100644 --- a/cl/search/factories.py +++ b/cl/search/factories.py @@ -465,8 +465,8 @@ class ScotusDocketDataFactory(DictFactory): lower_court_decision_date = Faker("date_object") lower_court_rehearing_denied_date = Faker("date_object") questions_presented = Faker("url") - docket_entries = List([SubFactory(SCOTUSDocketEntryFactory)]) - parties = [] + docket_entries = List([SubFactory(SCOTUSDocketEntryDataFactory)]) + parties = List([SubFactory(SCOTUSPartyDataFactory)]) class CaseTransferFactory(DjangoModelFactory): @@ -512,6 +512,3 @@ class TrialCourtDataFactory(DjangoModelFactory): class Meta: model = TrialCourtData - - docket_entries = List([SubFactory(SCOTUSDocketEntryDataFactory)]) - parties = List([SubFactory(SCOTUSPartyDataFactory)]) From f205da93d44b37ea0a56d36007e23ff6a22e13a7 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:51:24 -0600 Subject: [PATCH 67/87] fix(texas): Address PR review Add Court query error handling in merge_texas_case_transfers and merge_texas_trial_court; delete stale local files; change default for download_attachments to True; simplify nesting in merge_texas_trial_court --- cl/corpus_importer/tasks.py | 152 +++++++++++++++++++++++------------- cl/corpus_importer/tests.py | 42 ++-------- 2 files changed, 105 insertions(+), 89 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index a3e4d31398..a66493fb2a 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4000,10 +4000,16 @@ def merge_texas_trial_court_data( punishment = originating_court["punishment"] county = originating_court["county"] court_id = texas_originating_court_to_court_id(originating_court) + court = None + court_name = originating_court["name"] + judge = None if 
court_id: - court = Court.objects.get(pk=court_id) + try: + court = Court.objects.get(pk=court_id) + except Court.DoesNotExist: + logger.error("Court with ID %s not found.", court_id) + court = None court_name = court.full_name - if judge_name: judge = async_to_sync(lookup_judge_by_full_name)( name=judge_name, @@ -4011,12 +4017,6 @@ def merge_texas_trial_court_data( event_date=None, require_living_judge=False, ) - else: - judge = None - else: - court = None - court_name = originating_court["name"] - judge = None try: trial_court_data = TrialCourtData.objects.get( @@ -4144,7 +4144,7 @@ def merge_case_transfer(case_transfer: CaseTransfer) -> MergeResult: def merge_texas_document( docket_entry: TexasDocketEntry, input_document: TexasCaseDocument, - download_attachments: bool = False, + download_attachments: bool = True, ) -> MergeResult: """Merge a single TexasCaseDocument object into CL. @@ -4190,9 +4190,9 @@ def merge_texas_document( texas_document.description = input_document["description"] texas_document.media_version_id = input_document["media_version_id"] texas_document.url = input_document["document_url"] + if texas_document.filepath_local: + texas_document.filepath_local.delete(save=False) texas_document.filepath_local = "" - # Using this as a quick and dirty proxy for stale attachments, because - # I don't want to do a migration. texas_document.ocr_status = None texas_document.save() if download_attachments: @@ -4229,7 +4229,7 @@ def merge_texas_docket_entry( | TexasAppellateBrief | TexasSupremeCourtCaseEvent | TexasSupremeCourtAppellateBrief, - download_attachments: bool = False, + download_attachments: bool = True, ) -> MergeResult: """Merges a Texas docket entry into CL. 
@@ -4502,6 +4502,7 @@ def merge_texas_case_transfers( ) appeals_court = docket_data["appeals_court"] + transfer_origin_court: Court | None = None if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS.value: logger.info("Docket %s is from the CCA", docket.docket_number) @@ -4517,9 +4518,15 @@ def merge_texas_case_transfers( docket.docket_number, ) if trial_court_id: - transfer.origin_court = Court.objects.get( - pk=trial_court_id - ) + try: + transfer_origin_court = Court.objects.get( + pk=trial_court_id + ) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + trial_court_id, + ) transfer.origin_docket_number = docket_data[ "originating_court" ]["case"] @@ -4549,7 +4556,15 @@ def merge_texas_case_transfers( docket.docket_number, appeals_court_id, ) - transfer.origin_court = Court.objects.get(pk=appeals_court_id) + try: + transfer_origin_court = Court.objects.get( + pk=appeals_court_id + ) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + appeals_court_id, + ) transfer.origin_docket_number = appeals_court["case_number"] elif docket_data["court_id"] == CourtID.SUPREME_COURT.value: logger.info("Docket %s is from the SC", docket.docket_number) @@ -4568,7 +4583,13 @@ def merge_texas_case_transfers( docket.docket_number, appeals_court_id, ) - transfer.origin_court = Court.objects.get(pk=appeals_court_id) + try: + transfer_origin_court = Court.objects.get(pk=appeals_court_id) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + appeals_court_id, + ) transfer.origin_docket_number = appeals_court["case_number"] else: logger.error( @@ -4576,7 +4597,11 @@ def merge_texas_case_transfers( docket_data["court_id"], ) return MergeResult.failed() - transfers = [transfer] + if transfer_origin_court: + transfer.origin_court = transfer_origin_court + transfers = 
[transfer] + else: + transfers = [] elif docket_data["court_type"] == CourtType.APPELLATE.value: logger.info("Docket %s is an appellate docket", docket.docket_number) transfers = [] @@ -4585,19 +4610,27 @@ def merge_texas_case_transfers( "Appellate docket %s has a valid trial court", docket.docket_number, ) - transfers.append( - CaseTransfer( - origin_court=Court.objects.get(pk=trial_court_id), - origin_docket_number=docket_data["originating_court"][ - "case" - ], - destination_court=docket.court, - destination_docket_number=docket.docket_number, - destination_docket=docket, - transfer_date=docket_data["date_filed"], - transfer_type=CaseTransfer.APPEAL, + try: + appeal_origin_court = Court.objects.get(pk=trial_court_id) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + trial_court_id, + ) + else: + transfers.append( + CaseTransfer( + origin_court=Court.objects.get(pk=appeal_origin_court), + origin_docket_number=docket_data["originating_court"][ + "case" + ], + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, + ) ) - ) if docket_data["transfer_from"]: logger.info( "Appellate docket %s has a transfer in", docket.docket_number @@ -4608,30 +4641,39 @@ def merge_texas_case_transfers( "Missing transfer date for workload transfer of docket %s", docket.docket_number, ) - transfers.append( - CaseTransfer( - origin_court=Court.objects.get( - pk=texas_js_court_id_to_court_id( - docket_data["transfer_from"]["court_id"] - ) - ), - origin_docket_number=docket_data["transfer_from"][ - "origin_docket" - ], - destination_court=docket.court, - destination_docket_number=docket.docket_number, - destination_docket=docket, - # If the transfer date is absent or empty, assume it matches the filing date - transfer_date=transfer_from_date - if transfer_from_date - else 
docket_data["date_filed"], - # Texas Government Code 73.001 (accessed 2026-02-23) - transfer_type=CaseTransfer.JURISDICTION - if docket_data["court_id"] - == CourtID.FIFTEENTH_COURT_OF_APPEALS - else CaseTransfer.WORKLOAD, - ) + workload_origin_court_id = texas_js_court_id_to_court_id( + docket_data["transfer_from"]["court_id"] ) + try: + workload_origin_court = Court.objects.get( + pk=workload_origin_court_id + ) + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court.", + workload_origin_court_id, + ) + else: + transfers.append( + CaseTransfer( + origin_court=workload_origin_court, + origin_docket_number=docket_data["transfer_from"][ + "origin_docket" + ], + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + # If the transfer date is absent or empty, assume it matches the filing date + transfer_date=transfer_from_date + if transfer_from_date + else docket_data["date_filed"], + # Texas Government Code 73.001 (accessed 2026-02-23) + transfer_type=CaseTransfer.JURISDICTION + if docket_data["court_id"] + == CourtID.FIFTEENTH_COURT_OF_APPEALS + else CaseTransfer.WORKLOAD, + ) + ) else: logger.error( "Unrecognized Texas court type %s while creating CaseTransfer", @@ -4697,7 +4739,7 @@ def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket | TexasSupremeCourtDocket, - download_attachments: bool = False, + download_attachments: bool = True, ) -> MergeResult: """Merges scraped data from a Texas docket into the `Docket` table. @@ -4886,7 +4928,7 @@ def merge_texas_docket( def texas_ingest_docket_task( task: Task, i: tuple[bytes, TexasDocketMeta], - download_attachments: bool = False, + download_attachments: bool = True, ) -> MergeResult: """ Task to parse and merge a Texas docket. 
diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index ecc64cce5a..5e2bfcd1c2 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -2260,9 +2260,7 @@ def test_merge_texas_document_new_document(self): input_document = TexasCaseDocumentDictFactory() with self.captureOnCommitCallbacks(execute=True): - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + result = merge_texas_document(docket_entry, input_document) assert result.create is True assert result.success is True @@ -2334,9 +2332,7 @@ def test_merge_texas_document_existing_document_update(self): ) with self.captureOnCommitCallbacks(execute=True): - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + result = merge_texas_document(docket_entry, input_document) assert result.create is False assert result.update is True @@ -2402,9 +2398,7 @@ def get_test_pdf( docket_entry = self.docket_coa1_entry with self.captureOnCommitCallbacks(execute=True): - result = merge_texas_document( - docket_entry, input_document, download_attachments=True - ) + result = merge_texas_document(docket_entry, input_document) docket_entry.refresh_from_db() document = TexasDocument.objects.get(pk=result.pk) @@ -2423,11 +2417,7 @@ def test_merge_texas_docket_entry_new_entry(self): with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - docket_entry, - download_attachments=True, + self.docket_coa1, "2025-01-02.000", True, docket_entry ) assert output.create is True @@ -2457,11 +2447,7 @@ def test_merge_texas_docket_entry_no_update(self): with self.captureOnCommitCallbacks(execute=True): result = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, + self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) pk = result.pk documents = 
TexasDocument.objects.filter(docket_entry_id=pk) @@ -2474,11 +2460,7 @@ def test_merge_texas_docket_entry_no_update(self): # noop with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, + self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) assert output.create is False @@ -2512,11 +2494,7 @@ def test_merge_texas_docket_entry_add_document(self): with self.captureOnCommitCallbacks(execute=True): result = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, + self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) @@ -2529,11 +2507,7 @@ def test_merge_texas_docket_entry_add_document(self): js_docket_entry["attachments"].append(TexasCaseDocumentDictFactory()) with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, - "2025-01-02.000", - True, - js_docket_entry, - download_attachments=True, + self.docket_coa1, "2025-01-02.000", True, js_docket_entry ) assert output.create is True From 9c0253471f7107958a93413b05b82712e58a2cbc Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 14:27:25 -0600 Subject: [PATCH 68/87] refactor(texas): Lower nesting level and simplify logic in merge_texas_case_transfers Additionally, modify factories to more closely match Juriscraper --- cl/corpus_importer/tasks.py | 323 ++++++++++++----------------- cl/corpus_importer/tests.py | 5 +- cl/search/state/texas/factories.py | 14 +- 3 files changed, 154 insertions(+), 188 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index a66493fb2a..763c8c5970 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4486,218 +4486,169 @@ def 
merge_texas_case_transfers( docket.docket_number, docket.court.pk, ) - trial_court_id = texas_originating_court_to_court_id( - docket_data["originating_court"] - ) - if docket_data["court_type"] == CourtType.SUPREME.value: - # Assume that the originating court -> appellate court transfer will - # be populated by an appellate docket later on. - transfer = CaseTransfer( - destination_court=docket.court, - destination_docket_number=docket.docket_number, - destination_docket=docket, - transfer_date=docket_data["date_filed"], - transfer_type=CaseTransfer.APPEAL, - ) + originating_court = docket_data["originating_court"] + oc_type = originating_court["court_type"] + oc_dn = originating_court["case"] + appeals_court = docket_data.get("appeals_court", {}) + ac_id = appeals_court.get("court_id", "") + ac_dn = appeals_court.get("case_number", "") + trial_court_id = texas_originating_court_to_court_id(originating_court) + appeal_transfer_origin_court_id = "" + appeal_transfer_origin_dn = "" + + transfers = [] + + match docket_data["court_id"]: + # Death penalty cases are automatically appealed to the CCA so the + # appellate court may be missing. + case CourtID.COURT_OF_CRIMINAL_APPEALS.value if ( + ac_id == CourtID.UNKNOWN.value + ): + logger.info( + "Docket %s in the CCA is a death penalty appeal", + docket.docket_number, + ) - appeals_court = docket_data["appeals_court"] - transfer_origin_court: Court | None = None + if not trial_court_id: + logger.error( + "Unable to determine trial court ID for Texas docket %s to create death penalty appeal CaseTransfer", + docket.docket_number, + ) + return MergeResult.failed() - if docket_data["court_id"] == CourtID.COURT_OF_CRIMINAL_APPEALS.value: - logger.info("Docket %s is from the CCA", docket.docket_number) - # Death penalty cases are automatically appealed to the CCA so the - # appellate court may be missing. 
- if ( - not appeals_court - or appeals_court["court_id"] == CourtID.UNKNOWN.value - ): - # Death penalty appeal - logger.info( - "Docket %s in the CCA is a death penalty appeal", + appeal_transfer_origin_dn = oc_dn + appeal_transfer_origin_court_id = trial_court_id + case CourtID.COURT_OF_CRIMINAL_APPEALS.value: + logger.info( + "Docket %s is a non-death penalty CCA docket", + docket.docket_number, + ) + + appeal_transfer_origin_dn = ac_dn + appeal_transfer_origin_court_id = texas_js_court_id_to_court_id( + ac_id + ) + case CourtID.SUPREME_COURT.value if ac_id == CourtID.UNKNOWN.value: + if oc_type == CourtType.UNKNOWN.value: + logger.warning( + "Found Texas SC docket with no originating or appellate information (docket number %s).", docket.docket_number, ) - if trial_court_id: - try: - transfer_origin_court = Court.objects.get( - pk=trial_court_id - ) - except Court.DoesNotExist: - logger.error( - "Court with ID %s not found while populating CaseTransfer.origin_court.", - trial_court_id, - ) - transfer.origin_docket_number = docket_data[ - "originating_court" - ]["case"] - else: - logger.error( - "Unable to determine trial court ID for Texas docket %s to create CaseTransfer", - docket.docket_number, - ) - return MergeResult.failed() - else: + + return MergeResult.failed() + + logger.warning( + "Found Texas SC docket with originating information but no appellate information (docket number %s). 
Falling back to using trial court to create appeal type transfer.", + docket.docket_number, + ) + + appeal_transfer_origin_dn = oc_dn + appeal_transfer_origin_court_id = trial_court_id + case CourtID.SUPREME_COURT.value: + logger.info("Docket %s is a SC docket", docket.docket_number) + appeal_transfer_origin_court_id = texas_js_court_id_to_court_id( + ac_id + ) + appeal_transfer_origin_dn = ac_dn + case _ if docket_data["court_type"] == CourtType.APPELLATE.value: + logger.info( + "Docket %s is an appellate docket", docket.docket_number + ) + + appeal_transfer_origin_court_id = trial_court_id + appeal_transfer_origin_dn = oc_dn + + transfer_from = docket_data.get("transfer_from") + if transfer_from: logger.info( - "Docket %s in the CCA is not a death penalty appeal", + "Appellate docket %s has an incoming transfer", docket.docket_number, ) - if appeals_court["court_id"] == CourtID.UNKNOWN.value: - logger.error( - "Found appellate court with unknown ID (docket %s)", - docket.docket_number, - ) - return MergeResult.failed() - else: - appeals_court_id = texas_js_court_id_to_court_id( - appeals_court["court_id"] - ) - logger.info( - "Appeals court ID for CCA docket %s is %s", + + coa_transfer_date = transfer_from["date"] + # If the transfer date is absent or empty, assume it matches the filing date + if not coa_transfer_date: + logger.warning( + "Missing transfer date for transfer of docket %s. 
Defaulting to filing date.", docket.docket_number, - appeals_court_id, ) + coa_transfer_date = docket_data["date_filed"] + + coa_transfer_origin_court_id = texas_js_court_id_to_court_id( + transfer_from["court_id"] + ) + try: - transfer_origin_court = Court.objects.get( - pk=appeals_court_id + coa_transfer_origin_court = Court.objects.get( + pk=coa_transfer_origin_court_id ) except Court.DoesNotExist: logger.error( "Court with ID %s not found while populating CaseTransfer.origin_court.", - appeals_court_id, + coa_transfer_origin_court_id, ) - transfer.origin_docket_number = appeals_court["case_number"] - elif docket_data["court_id"] == CourtID.SUPREME_COURT.value: - logger.info("Docket %s is from the SC", docket.docket_number) - if appeals_court["court_id"] == CourtID.UNKNOWN.value: - logger.warning( - "Found appellate court with unknown ID (docket %s)", - docket.docket_number, - ) - return MergeResult.failed() - else: - appeals_court_id = texas_js_court_id_to_court_id( - appeals_court["court_id"] - ) - logger.info( - "Appeals court ID for SC docket %s is %s", - docket.docket_number, - appeals_court_id, - ) - try: - transfer_origin_court = Court.objects.get(pk=appeals_court_id) - except Court.DoesNotExist: - logger.error( - "Court with ID %s not found while populating CaseTransfer.origin_court.", - appeals_court_id, - ) - transfer.origin_docket_number = appeals_court["case_number"] - else: + else: + # Texas Government Code 73.001 (accessed 2026-02-23) + coa_transfer_type = ( + CaseTransfer.JURISDICTION + if docket_data["court_id"] + == CourtID.FIFTEENTH_COURT_OF_APPEALS.value + else CaseTransfer.WORKLOAD + ) + transfers.append( + CaseTransfer( + origin_court=coa_transfer_origin_court, + origin_docket_number=transfer_from[ + "origin_docket" + ], + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + transfer_date=coa_transfer_date, + transfer_type=coa_transfer_type, + ) + ) + case _: logger.error( - 
"Unrecognized Texas final court ID %s while creating CaseTransfer", + "Unrecognized Texas court ID %s and type %s while creating CaseTransfer", docket_data["court_id"], + docket_data["court_type"], ) + return MergeResult.failed() - if transfer_origin_court: - transfer.origin_court = transfer_origin_court - transfers = [transfer] - else: - transfers = [] - elif docket_data["court_type"] == CourtType.APPELLATE.value: - logger.info("Docket %s is an appellate docket", docket.docket_number) - transfers = [] - if trial_court_id: - logger.info( - "Appellate docket %s has a valid trial court", - docket.docket_number, + + if appeal_transfer_origin_court_id: + try: + appeal_origin_court = Court.objects.get( + pk=appeal_transfer_origin_court_id ) - try: - appeal_origin_court = Court.objects.get(pk=trial_court_id) - except Court.DoesNotExist: - logger.error( - "Court with ID %s not found while populating CaseTransfer.origin_court.", - trial_court_id, - ) - else: - transfers.append( - CaseTransfer( - origin_court=Court.objects.get(pk=appeal_origin_court), - origin_docket_number=docket_data["originating_court"][ - "case" - ], - destination_court=docket.court, - destination_docket_number=docket.docket_number, - destination_docket=docket, - transfer_date=docket_data["date_filed"], - transfer_type=CaseTransfer.APPEAL, - ) - ) - if docket_data["transfer_from"]: - logger.info( - "Appellate docket %s has a transfer in", docket.docket_number + except Court.DoesNotExist: + logger.error( + "Court with ID %s not found while populating CaseTransfer.origin_court with appeal type.", + appeal_transfer_origin_court_id, ) - transfer_from_date = docket_data["transfer_from"]["date"] - if not transfer_from_date: - logger.warning( - "Missing transfer date for workload transfer of docket %s", - docket.docket_number, + else: + transfers.append( + CaseTransfer( + destination_court=docket.court, + destination_docket_number=docket.docket_number, + destination_docket=docket, + 
origin_court=appeal_origin_court, + origin_docket_number=appeal_transfer_origin_dn, + transfer_date=docket_data["date_filed"], + transfer_type=CaseTransfer.APPEAL, ) - workload_origin_court_id = texas_js_court_id_to_court_id( - docket_data["transfer_from"]["court_id"] ) - try: - workload_origin_court = Court.objects.get( - pk=workload_origin_court_id - ) - except Court.DoesNotExist: - logger.error( - "Court with ID %s not found while populating CaseTransfer.origin_court.", - workload_origin_court_id, - ) - else: - transfers.append( - CaseTransfer( - origin_court=workload_origin_court, - origin_docket_number=docket_data["transfer_from"][ - "origin_docket" - ], - destination_court=docket.court, - destination_docket_number=docket.docket_number, - destination_docket=docket, - # If the transfer date is absent or empty, assume it matches the filing date - transfer_date=transfer_from_date - if transfer_from_date - else docket_data["date_filed"], - # Texas Government Code 73.001 (accessed 2026-02-23) - transfer_type=CaseTransfer.JURISDICTION - if docket_data["court_id"] - == CourtID.FIFTEENTH_COURT_OF_APPEALS - else CaseTransfer.WORKLOAD, - ) - ) - else: - logger.error( - "Unrecognized Texas court type %s while creating CaseTransfer", - docket_data["court_type"], - ) - return MergeResult.failed() - any_created = False - for transfer in transfers: - case_transfer = CaseTransfer( - origin_court=transfer.origin_court, - origin_docket_number=transfer.origin_docket_number, - origin_docket=transfer.origin_docket, - destination_court=transfer.destination_court, - destination_docket_number=transfer.destination_docket_number, - destination_docket=transfer.destination_docket, - transfer_date=transfer.transfer_date, - transfer_type=transfer.transfer_type, - ) - merge_result = merge_case_transfer(case_transfer) - if merge_result.create: - any_created = True + results = [merge_case_transfer(transfer) for transfer in transfers] - return MergeResult(success=True, create=any_created, 
update=False, pk=None) + return MergeResult( + success=all([r.success for r in results]), + create=any([r.create for r in results]), + update=any([r.update for r in results]), + pk=None, + ) def generate_texas_appellate_brief_flags( diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 5e2bfcd1c2..ba3ea420b3 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -3058,6 +3058,7 @@ def test_merge_texas_case_transfers_supreme_court(self): court_id=CourtID.SUPREME_COURT.value, docket_number=docket_sc.docket_number, appeals_court=appeals_court, + is_direct_appeal=False, ) result = merge_texas_case_transfers(docket_sc, docket_data) @@ -3087,6 +3088,7 @@ def test_merge_texas_case_transfers_cca_from_appellate(self): court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, docket_number=docket_cca.docket_number, appeals_court=appeals_court, + is_direct_appeal=False, ) result = merge_texas_case_transfers(docket_cca, docket_data) @@ -3117,7 +3119,7 @@ def test_merge_texas_case_transfers_cca_death_penalty_direct_appeal( court_id=CourtID.COURT_OF_CRIMINAL_APPEALS.value, docket_number=docket_cca.docket_number, originating_court=originating_court, - appeals_court=None, + is_direct_appeal=True, ) result = merge_texas_case_transfers(docket_cca, docket_data) @@ -3229,6 +3231,7 @@ def test_merge_texas_docket_final_court_sets_appeal_from(self): originating_court=TexasOriginatingDistrictCourtDictFactory( district=100 ), + is_direct_appeal=False, ) result = merge_texas_docket(docket_data) diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index dd00530ee1..02e27f5def 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -232,10 +232,11 @@ class TexasFinalCourtDocketDictFactory(TexasCommonDataDictFactory): CourtID.COURT_OF_CRIMINAL_APPEALS.value, ), ) + is_direct_appeal = Faker("pybool") @factory.post_generation @staticmethod - def set_sc(obj, create, extracted, **kwargs): + 
def post_gen(obj, create, extracted, **kwargs): if not create: return if obj["court_id"] == CourtID.SUPREME_COURT.value: @@ -257,3 +258,14 @@ def set_sc(obj, create, extracted, **kwargs): ), obj["appellate_briefs"], ) + if obj["is_direct_appeal"]: + obj["appeals_court"] = TexasAppellateCourtInfoDictFactory( + court_id=CourtID.UNKNOWN.value, + case_number="", + case_url="", + disposition="", + district="", + justice="", + opinion_cite="", + ) + del obj["is_direct_appeal"] From 62cfd9f42aa5491190498312f04ebf7bdeb71a2c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:30:14 -0600 Subject: [PATCH 69/87] feat(texas): Add helper method to determine whether a docket has appellate info --- cl/corpus_importer/tasks.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 763c8c5970..17a0b978ee 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4377,6 +4377,30 @@ def normalize_texas_parties( ] +def texas_docket_has_appellate_info( + docket_data: TexasCourtOfAppealsDocket + | TexasCourtOfCriminalAppealsDocket + | TexasSupremeCourtDocket, +) -> bool: + """ + Helper method returning whether a scraped Texas docket has appellate case + info. + + Checks that the docket court is not an appellate court (cases in appellate + courts cannot be appealed to appellate courts) and that the "appeals_court" + entry of docket data is filled in. + + :param docket_data: The scraped docket data. + + :return: Whether the docket has appellate case information. 
+ """ + + return ( + docket_data["court_type"] != CourtType.APPELLATE.value + and docket_data["appeals_court"]["court_id"] != CourtID.UNKNOWN.value + ) + + def merge_texas_parties( docket: Docket, parties: list[TexasCaseParty] ) -> MergeResult: @@ -4424,10 +4448,7 @@ def merge_texas_docket_originating_court( originating_court_information = docket.originating_court_information - if ( - docket_data["court_type"] == CourtType.APPELLATE.value - or docket_data["appeals_court"]["court_id"] == CourtID.UNKNOWN.value - ): + if not texas_docket_has_appellate_info(docket_data): ocd = docket_data["originating_court"] oc_dn = ocd["case"] oc_reporter = ocd["reporter"] @@ -4760,11 +4781,7 @@ def merge_texas_docket( court.pk, ) - if ( - docket_data["court_type"] == CourtType.APPELLATE.value - or docket_data["appeals_court"]["court_id"] - == CourtID.UNKNOWN.value - ): + if not texas_docket_has_appellate_info(docket_data): lower_court_data = docket_data["originating_court"] lower_court_id = texas_originating_court_to_court_id( lower_court_data From 7e232c5e6dda40c6dbb0d504bb16b78be21cd87d Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:35:26 -0600 Subject: [PATCH 70/87] feat(admin): Add admin page for TrialCourtData model Co-Authored-By: Claude Opus 4.6 --- cl/search/admin.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/cl/search/admin.py b/cl/search/admin.py index 673d672f6a..306a7dc44e 100644 --- a/cl/search/admin.py +++ b/cl/search/admin.py @@ -35,6 +35,7 @@ ScotusDocketMetadata, SCOTUSDocument, SearchQuery, + TrialCourtData, ) from cl.search.state.texas.models import TexasDocketEntry, TexasDocument from cl.search.utils import seal_documents @@ -562,6 +563,30 @@ def change_view(self, request, object_id, form_url="", extra_context=None): ) +@admin.register(TrialCourtData) +class TrialCourtDataAdmin(CursorPaginatorAdmin): + raw_id_fields = ( + "docket", + "judge", + ) + 
autocomplete_fields = ("court",) + readonly_fields = ( + "date_created", + "date_modified", + ) + list_display = ( + "__str__", + "docket_number_trial", + "court_name", + "date_filed", + ) + search_help_text = "Search by docket ID or trial court docket number." + search_fields = ( + "=docket__id", + "docket_number_trial", + ) + + @admin.register(OpinionsCited) class OpinionsCitedAdmin(CursorPaginatorAdmin): raw_id_fields = ( From 32a99b94fdfa0dcd1d4f4058925b2489992b2016 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:49:02 -0600 Subject: [PATCH 71/87] fix(texas): mypy issue --- cl/corpus_importer/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 17a0b978ee..e92f1d960b 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4515,7 +4515,7 @@ def merge_texas_case_transfers( ac_id = appeals_court.get("court_id", "") ac_dn = appeals_court.get("case_number", "") trial_court_id = texas_originating_court_to_court_id(originating_court) - appeal_transfer_origin_court_id = "" + appeal_transfer_origin_court_id: str | None = "" appeal_transfer_origin_dn = "" transfers = [] From 0d2b5f6acbe5f673583cc6b2f40f30bf6520ee5d Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 15:54:43 -0600 Subject: [PATCH 72/87] fix(texas): mypy again Missed one --- cl/corpus_importer/tasks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index e92f1d960b..d32aef7178 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4009,7 +4009,8 @@ def merge_texas_trial_court_data( except Court.DoesNotExist: logger.error("Court with ID %s not found.", court_id) court = None - court_name = court.full_name + else: + court_name = court.full_name if judge_name: 
judge = async_to_sync(lookup_judge_by_full_name)( name=judge_name, From e4950c02fd8ae1487f18c5469d8ddd317896d6a1 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Fri, 13 Mar 2026 18:04:46 -0600 Subject: [PATCH 73/87] refactor(texas): General code cleanup Tidy redundant logging statements; move some conditionals around; get rid of branches that will never be taken --- cl/corpus_importer/tasks.py | 156 +++++++++++++----------------------- 1 file changed, 57 insertions(+), 99 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index d32aef7178..732691e26d 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4172,13 +4172,6 @@ def merge_texas_document( media_id=input_document["media_id"], docket_entry=docket_entry, ) - except TexasDocument.MultipleObjectsReturned: - logger.error( - "Found multiple TexasDocument objects on the same docket entry (%s) with the same media_id (%s)", - docket_entry.pk, - input_document["media_id"], - ) - return MergeResult.failed() else: existed = True needs_update = ( @@ -4260,21 +4253,9 @@ def merge_texas_docket_entry( appellate_brief=appellate_brief, ) + docket_entry = None try: docket_entry = docket_entries.get() - except TexasDocketEntry.DoesNotExist: - logger.info( - "No existing TexasDocketEntry found for sequence number %s on Docket %s. Creating new entry.", - sequence_number, - docket.pk, - ) - docket_entry = TexasDocketEntry( - docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, - ) - created = True except TexasDocketEntry.MultipleObjectsReturned: # More filtering needed matching_sequence_number = docket_entries.filter( @@ -4292,37 +4273,32 @@ def merge_texas_docket_entry( docket.pk, ) docket_entry = matching_sequence_number - created = False - else: - logger.error( - "No existing TexasDocketEntry found for sequence number %s on Docket %s. 
Creating new entry.", - sequence_number, - docket.pk, - ) - docket_entry = TexasDocketEntry( - docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, - ) - created = True else: logger.info( "Found existing TexasDocketEntry for sequence number %s on Docket %s. Updating entry.", sequence_number, docket.pk, ) - created = False + + created = False + if not docket_entry: + logger.error( + "No existing TexasDocketEntry found for sequence number %s on Docket %s. Creating new entry.", + sequence_number, + docket.pk, + ) + docket_entry = TexasDocketEntry( + docket=docket, + date_filed=input_docket_entry["date"], + entry_type=input_docket_entry["type"], + appellate_brief=appellate_brief, + ) + created = True docket_entry.sequence_number = sequence_number docket_entry.description = input_docket_entry.get("description", "") docket_entry.disposition = input_docket_entry.get("disposition", "") docket_entry.remarks = input_docket_entry.get("remarks", "") - logger.info( - "Saving TexasDocketEntry %s on Docket %s", - docket_entry.pk, - docket.pk, - ) docket_entry.save() logger.info( @@ -4433,50 +4409,51 @@ def merge_texas_docket_originating_court( :param docket: The docket to add the originating court to. :param docket_data: The docket data from Juriscraper. 
:return: The result of the merge operation.""" - if ( + + if texas_docket_has_appellate_info(docket_data): + ocd = docket_data["appeals_court"] + oc_dn = ocd["case_number"] + oc_reporter = "" + oc_judge = ocd["justice"] + oc_id = texas_js_court_id_to_court_id(ocd["court_id"]) + elif ( docket_data["originating_court"]["court_type"] - == CourtType.UNKNOWN.value + != CourtType.UNKNOWN.value ): + ocd = docket_data["originating_court"] + oc_dn = ocd["case"] + oc_reporter = ocd["reporter"] + oc_judge = ocd["judge"] + oc_id = texas_originating_court_to_court_id(ocd) + else: logger.warning( "Skipping merge of OCI for Texas docket %s due to unknown originating court type.", - docket_data["docket_number"], + docket.docket_number, ) - return MergeResult(create=False, update=False, success=False, pk=None) + return MergeResult.failed() + created = False if not docket.originating_court_information: created = True docket.originating_court_information = OriginatingCourtInformation() - originating_court_information = docket.originating_court_information - - if not texas_docket_has_appellate_info(docket_data): - ocd = docket_data["originating_court"] - oc_dn = ocd["case"] - oc_reporter = ocd["reporter"] - oc_judge = ocd["judge"] - oc_id = texas_originating_court_to_court_id(ocd) - else: - ocd = docket_data["appeals_court"] - oc_dn = ocd["case_number"] - oc_reporter = "" - oc_judge = ocd["justice"] - oc_id = texas_js_court_id_to_court_id(ocd["court_id"]) + oci = docket.originating_court_information - originating_court_information.docket_number = oc_dn - originating_court_information.docket_number_raw = oc_dn - originating_court_information.court_reporter = oc_reporter - originating_court_information.assigned_to_str = oc_judge + oci.docket_number = oc_dn + oci.docket_number_raw = oc_dn + oci.court_reporter = oc_reporter + oci.assigned_to_str = oc_judge # Only update judge if we're able to associate them with a court. 
if oc_id: async_to_sync(lookup_judge_by_full_name_and_set_attr)( - item=originating_court_information, + item=oci, target_field="assigned_to", full_name=oc_judge, court_id=oc_id, event_date=None, require_living_judge=False, ) - originating_court_information.save() + oci.save() if created: docket.save() @@ -4725,13 +4702,13 @@ def merge_texas_docket( ) docket_number = docket_data["docket_number"] logger.info("Merging Texas docket %s", docket_number) + + if docket_data["court_type"] == CourtType.UNKNOWN.value: + logger.error("Texas docket %s has unknown court type", docket_number) + return MergeResult.failed() + with transaction.atomic(): docket = None - if docket_data["court_type"] == CourtType.UNKNOWN.value: - logger.error( - "Texas docket %s has unknown court type", docket_number - ) - return MergeResult.failed() if docket_data["court_type"] == CourtType.APPELLATE.value: logger.info( "Docket is appellate. Checking if disaggregation is necessary..." @@ -4775,26 +4752,21 @@ def merge_texas_docket( originating_court_merge_result = merge_texas_docket_originating_court( docket, docket_data ) - if not originating_court_merge_result.success: - logger.error( - "Failed to update originating court information for Texas docket %s in court %s", - docket.docket_number, - court.pk, - ) - if not texas_docket_has_appellate_info(docket_data): - lower_court_data = docket_data["originating_court"] - lower_court_id = texas_originating_court_to_court_id( - lower_court_data - ) - else: + if texas_docket_has_appellate_info(docket_data): lower_court_data = docket_data["appeals_court"] lower_court_id = texas_js_court_id_to_court_id( lower_court_data["court_id"] ) + else: + lower_court_data = docket_data["originating_court"] + lower_court_id = texas_originating_court_to_court_id( + lower_court_data + ) - lower_court_name = None - if lower_court_id is not None: + # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts + lower_court_name = 
lower_court_data.get("name", "") + if lower_court_id: try: lower_court = Court.objects.get(pk=lower_court_id) except Court.DoesNotExist: @@ -4805,9 +4777,6 @@ def merge_texas_docket( else: docket.appeal_from = lower_court lower_court_name = lower_court.full_name - if not lower_court_name: - # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts - lower_court_name = lower_court_data.get("name", "") docket.appeal_from_str = lower_court_name docket.save() @@ -4818,12 +4787,6 @@ def merge_texas_docket( trial_court_result = MergeResult.unnecessary(None) party_merge_result = merge_texas_parties(docket, docket_data["parties"]) - if not party_merge_result.success: - logger.error( - "Failed to merge party data for Texas docket %s in court %s", - docket.docket_number, - court.pk, - ) entry_merge_results = [ merge_texas_docket_entry( @@ -4845,12 +4808,6 @@ def merge_texas_docket( merge_case_transfer_result = merge_texas_case_transfers( docket, docket_data ) - if not merge_case_transfer_result.success: - logger.error( - "Failed to merge CaseTransfer data for Texas docket %s in court %s", - docket.docket_number, - court.pk, - ) create = ( party_merge_result.create @@ -4875,9 +4832,10 @@ def merge_texas_docket( ) if not success: logger.error( - "One or more steps in Texas case merging failed for docket %s (pk %s). Please review logs.", + "One or more steps in Texas case merging failed for docket %s (pk %s) in court %s. Please review logs.", docket_number, docket.pk, + court.pk, ) return MergeResult( From bf9a2983d64bf504669319029ec8d07089da14b2 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Mon, 16 Mar 2026 09:27:58 -0600 Subject: [PATCH 74/87] fix(texas): Restore DoesNotExist handler in merge_texas_docket_entry The previous refactor removed the TexasDocketEntry.DoesNotExist handler, causing unhandled exceptions when creating new docket entries. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cl/corpus_importer/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 732691e26d..bc98d99bed 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4256,6 +4256,8 @@ def merge_texas_docket_entry( docket_entry = None try: docket_entry = docket_entries.get() + except TexasDocketEntry.DoesNotExist: + pass except TexasDocketEntry.MultipleObjectsReturned: # More filtering needed matching_sequence_number = docket_entries.filter( From 607436c13bf6738f5d08371879c00ce0d298800c Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Tue, 17 Mar 2026 11:29:16 -0600 Subject: [PATCH 75/87] fix(texas): Make sure description from appellate briefs is populated Update merge_texas_docket_entry method; add test for merging docket entries; remove generate_appellate_brief_flags; address PR feedback; update tests. --- cl/corpus_importer/tasks.py | 176 ++++++++++++----------- cl/corpus_importer/tests.py | 220 +++++++++++++++++++++-------- cl/search/state/texas/factories.py | 44 +++--- 3 files changed, 279 insertions(+), 161 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index bc98d99bed..f8a82f6023 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4218,19 +4218,19 @@ def merge_texas_document( def merge_texas_docket_entry( docket: Docket, sequence_number: str, - appellate_brief: bool, - input_docket_entry: TexasCaseEvent - | TexasAppellateBrief - | TexasSupremeCourtCaseEvent - | TexasSupremeCourtAppellateBrief, + case_event: TexasCaseEvent | TexasSupremeCourtCaseEvent, + appellate_brief: TexasAppellateBrief + | TexasSupremeCourtAppellateBrief + | None = None, download_attachments: bool = True, ) -> MergeResult: """Merges a Texas docket entry into CL. :param docket: The docket this entry belongs to. 
:param sequence_number: The sequence number of the docket entry. - :param appellate_brief: Whether the docket entry is an appellate brief. - :param input_docket_entry: The docket entry being merged. + :param case_event: The docket entry information being merged. + :param appellate_brief: Appellate brief information if the docket entry is + an appellate brief, None otherwise. :param download_attachments: Whether to download docket entry attachments. :return: Tuple with the following entries @@ -4245,12 +4245,13 @@ def merge_texas_docket_entry( sequence_number, docket.pk, ) + appellate_brief_flag = bool(appellate_brief) Docket.objects.select_for_update().get(pk=docket.pk) docket_entries = TexasDocketEntry.objects.filter( docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, + date_filed=case_event["date"], + entry_type=case_event["type"], + appellate_brief=appellate_brief_flag, ) docket_entry = None @@ -4260,21 +4261,20 @@ def merge_texas_docket_entry( pass except TexasDocketEntry.MultipleObjectsReturned: # More filtering needed - matching_sequence_number = docket_entries.filter( + matching_sequence_numbers = docket_entries.filter( sequence_number=sequence_number - ).first() - logger.info( - "Multiple matching TexasDocketEntries found for sequence number %s on Docket %s.", - sequence_number, - docket.pk, ) - if matching_sequence_number: - logger.info( - "Found existing TexasDocketEntry for sequence number %s on Docket %s. 
Updating entry.", + try: + docket_entry = matching_sequence_numbers.get() + except TexasDocketEntry.MultipleObjectsReturned: + logger.error( + "Multiple matching TexasDocketEntries found for sequence number %s on Docket %s.", sequence_number, docket.pk, ) - docket_entry = matching_sequence_number + docket_entry = matching_sequence_numbers.first() + except TexasDocketEntry.DoesNotExist: + pass else: logger.info( "Found existing TexasDocketEntry for sequence number %s on Docket %s. Updating entry.", @@ -4284,23 +4284,20 @@ def merge_texas_docket_entry( created = False if not docket_entry: - logger.error( - "No existing TexasDocketEntry found for sequence number %s on Docket %s. Creating new entry.", - sequence_number, - docket.pk, - ) docket_entry = TexasDocketEntry( docket=docket, - date_filed=input_docket_entry["date"], - entry_type=input_docket_entry["type"], - appellate_brief=appellate_brief, + date_filed=case_event["date"], + entry_type=case_event["type"], + appellate_brief=appellate_brief_flag, ) created = True docket_entry.sequence_number = sequence_number - docket_entry.description = input_docket_entry.get("description", "") - docket_entry.disposition = input_docket_entry.get("disposition", "") - docket_entry.remarks = input_docket_entry.get("remarks", "") + docket_entry.description = ( + appellate_brief["description"] if appellate_brief else "" + ) + docket_entry.disposition = case_event["disposition"] + docket_entry.remarks = case_event.get("remarks", "") docket_entry.save() logger.info( @@ -4312,7 +4309,7 @@ def merge_texas_docket_entry( merge_texas_document( docket_entry, document, download_attachments=download_attachments ) - for document in input_docket_entry["attachments"] + for document in case_event["attachments"] ] return MergeResult( @@ -4323,6 +4320,58 @@ def merge_texas_docket_entry( ) +def merge_texas_docket_entries( + docket: Docket, + case_events: list[TexasCaseEvent] | list[TexasSupremeCourtCaseEvent], + appellate_briefs: 
list[TexasAppellateBrief] + | list[TexasSupremeCourtAppellateBrief], + download_attachments: bool = True, +) -> MergeResult: + """ + Merges a list of Texas case events and Texas appellate briefs for a given + docket into CL. + + :param docket: The parent docket. + :param case_events: Scraped case events. + :param appellate_briefs: Scraped appellate briefs. + :param download_attachments: Whether to download attachments. + + :return: The result of the attempted merge operation. + """ + brief_iter = iter(appellate_briefs) + next_brief = next(brief_iter, None) + + create = False + update = False + success = True + for i, (case_event, sequence_number) in enumerate( + zip(case_events, create_docket_entry_sequence_numbers(case_events)) + ): + appellate_brief = None + if ( + next_brief is not None + and case_event["date"] == next_brief["date"] + and case_event["type"] == next_brief["type"] + and case_event["attachments"] == next_brief["attachments"] + ): + appellate_brief = next_brief + next_brief = next(brief_iter, None) + + merge_result = merge_texas_docket_entry( + docket, + sequence_number, + case_event, + appellate_brief, + download_attachments=download_attachments, + ) + + create = merge_result.create or create + update = merge_result.update or update + success = merge_result.success and success + + return MergeResult(create=create, update=update, success=success, pk=None) + + def normalize_texas_parties( parties: list[TexasCaseParty], ) -> list[dict[str, Any]]: @@ -4652,41 +4701,6 @@ def merge_texas_case_transfers( ) -def generate_texas_appellate_brief_flags( - case_events: list[TexasCaseEvent], - appellate_briefs: list[TexasAppellateBrief], -) -> list[bool]: - """Generates a list of booleans indicating whether the corresponding entry - in the list of TexasCaseEvents is in the list of TexasAppellateBriefs. - - The "Appellate Briefs" table in TAMES appears to always be a subset of the - case events table. 
Therefore, we simply use the case events table to - generate docket entries and set an "appellate_brief" flag to indicate - whether the entry appears in the appellate briefs table. This method - generates those flags given the list of case events and the list of - appellate briefs. - - :param case_events: A list of TexasCaseEvent objects. - :param appellate_briefs: A list of TexasAppellateBrief objects. - :return: A list of booleans indicating whether the corresponding entry is - an appellate brief.""" - brief_iter = iter(appellate_briefs) - next_brief = next(brief_iter, None) - flags = [] - for case_event in case_events: - if ( - next_brief is not None - and case_event["date"] == next_brief["date"] - and case_event["type"] == next_brief["type"] - and case_event["attachments"] == next_brief["attachments"] - ): - flags.append(True) - next_brief = next(brief_iter, None) - else: - flags.append(False) - return flags - - def merge_texas_docket( docket_data: TexasCourtOfAppealsDocket | TexasCourtOfCriminalAppealsDocket @@ -4790,22 +4804,12 @@ def merge_texas_docket( party_merge_result = merge_texas_parties(docket, docket_data["parties"]) - entry_merge_results = [ - merge_texas_docket_entry( - docket, - sequence_number, - appellate_brief, - entry, - download_attachments=download_attachments, - ) - for sequence_number, appellate_brief, entry in zip( - create_docket_entry_sequence_numbers(docket_data["case_events"]), - generate_texas_appellate_brief_flags( - docket_data["case_events"], docket_data["appellate_briefs"] - ), - docket_data["case_events"], - ) - ] + entry_merge_result = merge_texas_docket_entries( + docket, + docket_data["case_events"], + docket_data["appellate_briefs"], + download_attachments=download_attachments, + ) merge_case_transfer_result = merge_texas_case_transfers( docket, docket_data @@ -4816,21 +4820,21 @@ def merge_texas_docket( or trial_court_result.create or originating_court_merge_result.create or merge_case_transfer_result.create - or 
any(r.create for r in entry_merge_results) + or entry_merge_result.create ) update = ( party_merge_result.update or trial_court_result.update or originating_court_merge_result.update or merge_case_transfer_result.update - or any(r.update for r in entry_merge_results) + or entry_merge_result.update ) success = ( party_merge_result.success and trial_court_result.success and originating_court_merge_result.success and merge_case_transfer_result.success - and all(r.success for r in entry_merge_results) + and entry_merge_result.success ) if not success: logger.error( diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 9b4ee87d82..06ecedb3bf 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -86,7 +86,6 @@ classify_case_name_by_llm, download_texas_document_pdf, generate_ia_json, - generate_texas_appellate_brief_flags, get_and_save_free_document_report, merge_texas_case_transfers, merge_texas_docket, @@ -2239,9 +2238,34 @@ def test_normalize_texas_parties_empty_atty_name(self): ], ) - def test_generate_appellate_brief_flags(self): + @patch( + "cl.corpus_importer.tasks.merge_texas_parties", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_case_transfers", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_trial_court_data", + return_value=MergeResult.created(1), + ) + @patch( + "cl.corpus_importer.tasks.merge_texas_docket_originating_court", + return_value=MergeResult.created(1), + ) + def test_merge_docket_entries_integration( + self, + mock_texas_oci, + mock_texas_tcd, + mock_texas_transfers, + mock_texas_parties, + ): n_events = fake.random_int(min=0, max=30) - case_events = [TexasCaseEventDictFactory() for _ in range(n_events)] + case_events = sorted( + [TexasSupremeCourtCaseEventDictFactory() for _ in range(n_events)], + key=lambda ce: ce["date"], + ) if len(case_events) == 0: appellate_brief_indices = [] @@ -2251,27 +2275,72 @@ def 
test_generate_appellate_brief_flags(self): ) appellate_briefs = [ - TexasAppellateBriefDictFactory( + TexasSupremeCourtAppellateBriefDictFactory( date=case_events[i]["date"], type=case_events[i]["type"], attachments=case_events[i]["attachments"], + remarks=case_events[i]["remarks"], ) for i in appellate_brief_indices ] - appellate_brief_flags = generate_texas_appellate_brief_flags( - case_events, appellate_briefs - ) - actual_flags = [ True if i in appellate_brief_indices else False for i in range(len(case_events)) ] - assert appellate_brief_flags == actual_flags, ( - f"Incorrect appellate brief flags ({appellate_brief_flags}!={actual_flags}).\nCase events: {case_events}\nAppellate briefs: {appellate_briefs}" + docket_dict = TexasFinalCourtDocketDictFactory( + court_id=CourtID.SUPREME_COURT.value, + case_events=case_events, + appellate_briefs=appellate_briefs, + ) + original_docket_entries = [ + e["pk"] for e in TexasDocketEntry.objects.all().values("pk") + ] + merge_result = merge_texas_docket(docket_dict) + + docket_entries = list( + TexasDocketEntry.objects.exclude( + pk__in=original_docket_entries + ).order_by("sequence_number") + ) + self.assertEqual( + len(docket_entries), + len(case_events), + f"Generated {len(docket_entries)} docket entries from {len(case_events)} input case events.", ) + ab_index = 0 + for i, docket_entry in enumerate(docket_entries): + self.assertEqual( + docket_entry.appellate_brief, + actual_flags[i], + f"Docket entry {i} has the wrong appellate brief flag (found {docket_entry.appellate_brief}, expected {actual_flags[i]}).", + ) + self.assertEqual( + docket_entry.remarks, + case_events[i]["remarks"], + f"Docket entry {i} has the wrong remarks (found {docket_entry.remarks}, expected {case_events[i]['remarks']}).", + ) + self.assertEqual( + docket_entry.disposition, + case_events[i]["disposition"], + f"Docket entry {i} has the wrong disposition (found {docket_entry.disposition}, expected {case_events[i]['disposition']}).", + ) + if 
actual_flags[i]: + self.assertEqual( + docket_entry.description, + appellate_briefs[ab_index]["description"], + f"Docket entry {i} has the wrong description (found {docket_entry.description}, expected {appellate_briefs[ab_index]['description']}).", + ) + ab_index += 1 + else: + self.assertEqual( + docket_entry.description, + "", + f"Docket entry {i} should not have description (found {docket_entry.description}).", + ) + def test_merge_texas_document_new_document(self): """Can we correctly add a new attachment to an existing docket entry?""" docket_entry = self.docket_coa1_entry @@ -2428,15 +2497,22 @@ def get_test_pdf( def test_merge_texas_docket_entry_new_entry(self): """Can we correctly handle a docket entry?""" - docket_entry = self.get_random_docket_entry_dict( + case_event = TexasCaseEventDictFactory( attachments=[TexasCaseDocumentDictFactory()], date=date.fromisoformat("2025-01-02"), type="Brief", ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, docket_entry + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief ) assert output.create is True @@ -2445,15 +2521,13 @@ def test_merge_texas_docket_entry_new_entry(self): assert output.pk is not None created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == docket_entry["type"] - assert created_docket_entry.disposition == docket_entry.get( - "disposition", "" + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" ) - assert 
created_docket_entry.description == docket_entry.get( - "description", "" - ) - assert created_docket_entry.remarks == docket_entry.get("remarks", "") - assert created_docket_entry.date_filed == docket_entry["date"] + assert created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() @@ -2462,11 +2536,22 @@ def test_merge_texas_docket_entry_new_entry(self): def test_merge_texas_docket_entry_no_update(self): """Can we correctly handle a docket entry update noop?""" - js_docket_entry = self.get_random_docket_entry_dict() + case_event = TexasCaseEventDictFactory( + attachments=[TexasCaseDocumentDictFactory()], + date=date.fromisoformat("2025-01-02"), + type="Brief", + ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) with self.captureOnCommitCallbacks(execute=True): result = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) @@ -2479,7 +2564,7 @@ def test_merge_texas_docket_entry_no_update(self): # noop with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief ) assert output.create is False @@ -2489,31 +2574,38 @@ def test_merge_texas_docket_entry_no_update(self): assert output.pk == pk created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == js_docket_entry["type"] - assert created_docket_entry.disposition == 
js_docket_entry.get( - "disposition", "" - ) - assert created_docket_entry.description == js_docket_entry.get( - "description", "" + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" ) - assert created_docket_entry.remarks == js_docket_entry.get( - "remarks", "" - ) - assert created_docket_entry.date_filed == js_docket_entry["date"] + assert created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() - assert n_attachments == len(js_docket_entry["attachments"]) + assert n_attachments == len(case_event["attachments"]) assert self.extract_pdf_document_mock.call_count == 0 def test_merge_texas_docket_entry_add_document(self): """Can we correctly add a new document to an existing docket entry?""" - js_docket_entry = self.get_random_docket_entry_dict() - initial_n_attachments = len(js_docket_entry["attachments"]) + case_event = TexasCaseEventDictFactory( + attachments=[TexasCaseDocumentDictFactory()], + date=date.fromisoformat("2025-01-02"), + type="Brief", + ) + appellate_brief = None + if fake.boolean(): + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) + initial_n_attachments = len(case_event["attachments"]) with self.captureOnCommitCallbacks(execute=True): result = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief ) pk = result.pk documents = TexasDocument.objects.filter(docket_entry_id=pk) @@ -2523,10 +2615,10 @@ def test_merge_texas_docket_entry_add_document(self): # Reset call count 
self.extract_pdf_document_mock.reset_mock() - js_docket_entry["attachments"].append(TexasCaseDocumentDictFactory()) + case_event["attachments"].append(TexasCaseDocumentDictFactory()) with self.captureOnCommitCallbacks(execute=True): output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.000", True, js_docket_entry + self.docket_coa1, "2025-01-02.000", case_event, appellate_brief ) assert output.create is True @@ -2536,17 +2628,13 @@ def test_merge_texas_docket_entry_add_document(self): assert output.pk == pk created_docket_entry = TexasDocketEntry.objects.get(pk=output.pk) assert created_docket_entry.docket_id == self.docket_coa1.id - assert created_docket_entry.entry_type == js_docket_entry["type"] - assert created_docket_entry.remarks == js_docket_entry.get( - "remarks", "" - ) - assert created_docket_entry.description == js_docket_entry.get( - "description", "" + assert created_docket_entry.entry_type == case_event["type"] + assert created_docket_entry.remarks == case_event.get("remarks", "") + assert created_docket_entry.description == ( + appellate_brief["description"] if appellate_brief else "" ) - assert created_docket_entry.disposition == js_docket_entry.get( - "disposition", "" - ) - assert created_docket_entry.date_filed == js_docket_entry["date"] + assert created_docket_entry.disposition == case_event["disposition"] + assert created_docket_entry.date_filed == case_event["date"] n_attachments = TexasDocument.objects.filter( docket_entry_id=created_docket_entry.id ).count() @@ -2573,16 +2661,21 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): disposition="Second entry", ) - js_docket_entry = TexasCaseEventDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], disposition="Updated disposition", date=date.fromisoformat("2025-01-02"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) # 
Should match the second entry by sequence number output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-02.001", True, js_docket_entry + self.docket_coa1, "2025-01-02.001", case_event, appellate_brief ) assert output.create is False @@ -2590,7 +2683,7 @@ def test_merge_texas_docket_entry_multiple_matches_with_sequence(self): assert output.success is True assert output.pk == existing_entry_2.pk updated_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert updated_entry.disposition == js_docket_entry["disposition"] + assert updated_entry.disposition == case_event["disposition"] assert updated_entry.sequence_number == "2025-01-02.001" # Ensure the first entry was not modified existing_entry_1.refresh_from_db() @@ -2608,16 +2701,21 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): disposition="Original description", ) - js_docket_entry = TexasCaseEventDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], disposition="Updated disposition", date=date.fromisoformat("2025-01-04"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) # Should update existing entry and change its sequence number output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-04.001", True, js_docket_entry + self.docket_coa1, "2025-01-04.001", case_event, appellate_brief ) assert output.create is False @@ -2625,7 +2723,7 @@ def test_merge_texas_docket_entry_single_match_updates_entry(self): assert output.success is True assert output.pk == existing_entry.pk updated_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert updated_entry.disposition == js_docket_entry["disposition"] + assert updated_entry.disposition == case_event["disposition"] assert updated_entry.sequence_number == "2025-01-04.001" def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): @@ -2647,17 +2745,21 @@ def 
test_merge_texas_docket_entry_multiple_matches_without_sequence(self): sequence_number="2025-01-03.001", disposition="Second entry", ) - - js_docket_entry = TexasCaseEventDictFactory( + case_event = TexasCaseEventDictFactory( attachments=[], disposition="New third entry", date=date.fromisoformat("2025-01-03"), type="Brief", ) + appellate_brief = TexasAppellateBriefDictFactory( + date=case_event["date"], + type=case_event["type"], + attachments=case_event["attachments"], + ) # Should create a new entry since no sequence number matches output = merge_texas_docket_entry( - self.docket_coa1, "2025-01-03.002", True, js_docket_entry + self.docket_coa1, "2025-01-03.002", case_event, appellate_brief ) assert output.create is True @@ -2666,7 +2768,7 @@ def test_merge_texas_docket_entry_multiple_matches_without_sequence(self): assert output.pk is not None assert output.pk not in (existing_entry_1.pk, existing_entry_2.pk) new_entry = TexasDocketEntry.objects.get(pk=output.pk) - assert new_entry.disposition == js_docket_entry["disposition"] + assert new_entry.disposition == case_event["disposition"] assert new_entry.sequence_number == "2025-01-03.002" # Ensure existing entries were not modified existing_entry_1.refresh_from_db() diff --git a/cl/search/state/texas/factories.py b/cl/search/state/texas/factories.py index 02e27f5def..68ba762ab6 100644 --- a/cl/search/state/texas/factories.py +++ b/cl/search/state/texas/factories.py @@ -240,23 +240,35 @@ def post_gen(obj, create, extracted, **kwargs): if not create: return if obj["court_id"] == CourtID.SUPREME_COURT.value: - obj["case_events"] = map( - lambda ce: TexasSupremeCourtCaseEventDictFactory( - date=ce["date"], - type=ce["type"], - attachments=ce["attachments"], - disposition=ce["disposition"], - ), - obj["case_events"], + obj["case_events"] = list( + map( + lambda ce: TexasSupremeCourtCaseEventDictFactory( + date=ce["date"], + type=ce["type"], + attachments=ce["attachments"], + disposition=ce["disposition"], + 
remarks=ce.get( + "remarks", + TexasSupremeCourtCaseEventDictFactory.remarks, + ), + ), + obj["case_events"], + ) ) - obj["appellate_briefs"] = map( - lambda ab: TexasSupremeCourtAppellateBriefDictFactory( - date=ab["date"], - type=ab["type"], - attachments=ab["attachments"], - description=ab["description"], - ), - obj["appellate_briefs"], + obj["appellate_briefs"] = list( + map( + lambda ab: TexasSupremeCourtAppellateBriefDictFactory( + date=ab["date"], + type=ab["type"], + attachments=ab["attachments"], + description=ab["description"], + remarks=ab.get( + "remarks", + TexasSupremeCourtAppellateBriefDictFactory.remarks, + ), + ), + obj["appellate_briefs"], + ) ) if obj["is_direct_appeal"]: obj["appeals_court"] = TexasAppellateCourtInfoDictFactory( From d01b3d6908f8241143de9591776b0eec935e1654 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 18 Mar 2026 09:43:19 -0600 Subject: [PATCH 76/87] fix(texas): Add error log and be more cautious with lower courts Add error log to merge_texas_docket_entry when creating a potential duplicate docket entry; set lower court name when we fail to find an appellate court just to be cautious. --- cl/corpus_importer/tasks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index f8a82f6023..9a739f8225 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4274,7 +4274,11 @@ def merge_texas_docket_entry( ) docket_entry = matching_sequence_numbers.first() except TexasDocketEntry.DoesNotExist: - pass + logger.error( + "Could not find matching TexasDocketEntry with sequence number %s on Docket %s. Creating new docket entry, which may be a duplicate...", + sequence_number, + docket.pk, + ) else: logger.info( "Found existing TexasDocketEntry for sequence number %s on Docket %s. 
Updating entry.", @@ -4790,6 +4794,8 @@ def merge_texas_docket( "Could not find lower court with ID %s to set appeal_from for Texas docket.", lower_court_id, ) + if lower_court_data["court_type"] == CourtType.APPELLATE.value: + lower_court_name = lower_court_data["district"] else: docket.appeal_from = lower_court lower_court_name = lower_court.full_name From 4c35c80d2354b894df8a920abb02d00d0defb163 Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 18 Mar 2026 13:45:52 -0600 Subject: [PATCH 77/87] fix(texas): Set appeal_from_str correctly and add test --- cl/corpus_importer/tasks.py | 5 ++--- cl/corpus_importer/tests.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index 9a739f8225..dd154b5377 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -4778,14 +4778,15 @@ def merge_texas_docket( lower_court_id = texas_js_court_id_to_court_id( lower_court_data["court_id"] ) + lower_court_name = lower_court_data["district"] else: lower_court_data = docket_data["originating_court"] lower_court_id = texas_originating_court_to_court_id( lower_court_data ) + lower_court_name = lower_court_data.get("name", "") # Assumes that we will only fail to generate a court ID for trial courts and never appellate courts - lower_court_name = lower_court_data.get("name", "") if lower_court_id: try: lower_court = Court.objects.get(pk=lower_court_id) @@ -4794,8 +4795,6 @@ def merge_texas_docket( "Could not find lower court with ID %s to set appeal_from for Texas docket.", lower_court_id, ) - if lower_court_data["court_type"] == CourtType.APPELLATE.value: - lower_court_name = lower_court_data["district"] else: docket.appeal_from = lower_court lower_court_name = lower_court.full_name diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index 06ecedb3bf..cd8552fbcf 100644 --- 
a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -3366,6 +3366,25 @@ def test_merge_texas_docket_final_court_sets_appeal_from(self): assert docket_sc.appeal_from_id == "txctapp1" assert docket_sc.appeal_from_str == self.texas_coa1.full_name + def test_merge_texas_docket_appeal_from_missing_court(self): + docket_dict = TexasFinalCourtDocketDictFactory.create( + is_direct_appeal=False, + appeals_court=TexasAppellateCourtInfoDictFactory( + court_id="texas_coa17", district="Not Real Court of Appeals" + ), + ) + + result = merge_texas_docket(docket_dict) + + assert result.success is True + + docket = Docket.objects.get(pk=result.pk) + + assert ( + docket.appeal_from_str == docket_dict["appeals_court"]["district"] + ) + assert docket.appeal_from is None + @patch( "cl.corpus_importer.tasks.merge_texas_case_transfers", return_value=MergeResult.created(1), From 4fc25c803e04cd67811a9d8ced51e5e3ab6d5c5e Mon Sep 17 00:00:00 2001 From: Morgan Bennet <98787614+MorganBennetDev@users.noreply.github.com> Date: Wed, 18 Mar 2026 16:27:09 -0600 Subject: [PATCH 78/87] feat(texas): Always select smallest DN in multi-DN strings Also add tests and warnings --- cl/lib/model_helpers.py | 30 +++++++++++++++++++++++------- cl/lib/tests.py | 11 +++++++++-- 2 files changed, 32 insertions(+), 9 deletions(-) diff --git a/cl/lib/model_helpers.py b/cl/lib/model_helpers.py index 9f146ebaa4..75fea09a5a 100644 --- a/cl/lib/model_helpers.py +++ b/cl/lib/model_helpers.py @@ -197,16 +197,32 @@ def clean_texas_docket_number(docket_number: str | None) -> str: if regex.fullmatch(docket_number): return docket_number - # Try fullmatch on each whitespace-separated token (dirty input - # like "Case Number: 04-97-00972-CV"). We use fullmatch rather - # than search because these regexes were designed for fullmatch - # and can produce false positives with partial matching. 
- for token in docket_number.split(): + tokens = [ + # Strip leading and trailing punctuation from tokens since it's likely invalid. + re.compile(r"^[^a-z0-9]+|[^a-z0-9]+$", re.IGNORECASE).sub("", token) + for token in docket_number.split() + ] + matching_parts = [] + for token in tokens: for regex in TEXAS_DN_REGEXES: if regex.fullmatch(token): - return token + matching_parts.append(token) - return "" + if len(matching_parts) == 0: + logger.warning( + "Could not find valid Texas docket number in string %s. Using empty string as clean docket number", + docket_number, + ) + return "" + + matching_parts.sort() + if len(matching_parts) > 1: + logger.warning( + "Found multiple docket numbers combined %s. Using %s as clean docket number.", + docket_number, + matching_parts[0], + ) + return matching_parts[0] def make_texas_docket_number_core(docket_number: str | None) -> str: diff --git a/cl/lib/tests.py b/cl/lib/tests.py index 6ce719e8fb..553c7aee35 100644 --- a/cl/lib/tests.py +++ b/cl/lib/tests.py @@ -451,7 +451,10 @@ def test_is_texas_court(self) -> None: def test_clean_texas_docket_number(self) -> None: """Can we extract Texas docket numbers from dirty input?""" test_cases = [ + ("Case Number: AP-77,129; 04-97-00972-CV", "04-97-00972-CV"), ("Case Number: 04-97-00972-CV", "04-97-00972-CV"), + ("Case Number: 04-97-00972-CV; AP-77,129", "04-97-00972-CV"), + ("AP-77,129, 04-97-00972-CV, and WR-70,849-04", "04-97-00972-CV"), ("04-97-00972-CV", "04-97-00972-CV"), ("AP-77,129", "AP-77,129"), ("WR-70,849-04", "WR-70,849-04"), @@ -460,9 +463,13 @@ def test_clean_texas_docket_number(self) -> None: ("", ""), ("garbage text", ""), ] - for input_dn, expected in test_cases: + for i, (input_dn, expected) in enumerate(test_cases): with self.subTest(input_dn=input_dn): - self.assertEqual(clean_texas_docket_number(input_dn), expected) + self.assertEqual( + clean_texas_docket_number(input_dn), + expected, + f"Failed test case {i}", + ) def test_texas_docket_number_core(self) -> None: 
"""Can we correctly normalize Texas docket numbers?""" From 8f7a662b4829d2513f8ea2f021b6737e9d558395 Mon Sep 17 00:00:00 2001 From: grossir <14970769+grossir@users.noreply.github.com> Date: Thu, 19 Mar 2026 01:43:44 +0000 Subject: [PATCH 79/87] Update freelawproject dependencies --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index d1a5876bc3..ea3dca1b20 100644 --- a/uv.lock +++ b/uv.lock @@ -1855,7 +1855,7 @@ wheels = [ [[package]] name = "juriscraper" -version = "3.0.0" +version = "3.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -1874,9 +1874,9 @@ dependencies = [ { name = "selenium" }, { name = "tldextract" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/9f/ff/7f34ccf378c996cee9e679a563fc3d6991043f2ff9f0d275b4ae8bbc3e52/juriscraper-3.0.0.tar.gz", hash = "sha256:02c2c67ecfd3a43ef444bdde7776b69f9997855529644f86a49d71e02643f075", size = 385434, upload-time = "2026-03-18T15:10:23.134Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ab/f5/4c80d7a2520ad74713ce4e4ccec453f8e97d43920bfe5899fa8b4a981753/juriscraper-3.0.1.tar.gz", hash = "sha256:41a7f75b655bac3b8237dbb910242d290eee42fa4dc6e5ffbbb80af78592b60e", size = 385421, upload-time = "2026-03-19T01:38:46.037Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9b/4e/151154b483ab6e391a301f192c5ddab12f773f13c6f9269297ad997e56ce/juriscraper-3.0.0-py3-none-any.whl", hash = "sha256:1c59c60a99f644de13689f777ec237c8e08bd3f08ce6d7214f19c6ce02cce326", size = 611776, upload-time = "2026-03-18T15:10:21.408Z" }, + { url = "https://files.pythonhosted.org/packages/42/f9/5813aed6741d04855d77880c0169edea0f06fecaff723e8f9e20476a819b/juriscraper-3.0.1-py3-none-any.whl", hash = "sha256:2e7cf5aed11df670e92d2c217b8e51827d808088f01bc8f23a4caddde7a6be5c", size = 611793, upload-time = "2026-03-19T01:38:44.016Z" }, ] [[package]] From c0b89ed38759c18fc6b65a1b7f2dea48360f9fcb Mon Sep 17 00:00:00 2001 
From: Eduardo Rosendo Date: Thu, 19 Mar 2026 02:17:31 -0400 Subject: [PATCH 80/87] tests(stats): Mocks microservice in semantic search Prometheus test The test_api_semantic_search_increments_metric test was making a real call to the embedding microservice. This commit mocks the microservice response with a random embedding vector to make the test self-contained and avoid external dependencies. --- cl/stats/tests.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/cl/stats/tests.py b/cl/stats/tests.py index ca4c9a5ec3..12d290ee2c 100644 --- a/cl/stats/tests.py +++ b/cl/stats/tests.py @@ -1,9 +1,12 @@ +import random from datetime import datetime, timedelta from http import HTTPStatus +from unittest import mock from unittest.mock import MagicMock, patch import pytest import time_machine +from django.conf import settings from django.core import mail from django.core.management import call_command from django.test import TestCase, override_settings @@ -325,10 +328,20 @@ async def test_api_keyword_search_increments_metric(self) -> None: final_count = await self._get_metric_count("keyword", "api") self.assertEqual(final_count, initial_count + 1) - async def test_api_semantic_search_increments_metric(self) -> None: + @mock.patch("cl.lib.elasticsearch_utils.microservice") + async def test_api_semantic_search_increments_metric( + self, mock_microservice + ) -> None: """Verify semantic API searches increment the Prometheus counter""" initial_count = await self._get_metric_count("semantic", "api") + inception_mock = MagicMock() + inception_mock.json.return_value = { + "embedding": [ + random.random() for _ in range(settings.EMBEDDING_DIMENSIONS) + ] + } + mock_microservice.return_value = inception_mock search_url = reverse("search-list", kwargs={"version": "v4"}) await self.async_client.get( search_url, From d3804b4407e5d57eeb71392b06f23ed22e1f0f92 Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Thu, 19 Mar 2026 11:25:50 -0400 Subject: 
[PATCH 81/87] feat(simple_pages): update robots.txt to allow AI bots to access whitelisted pages Fixes: #7111 --- cl/simple_pages/templates/robots.txt | 32 ++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/cl/simple_pages/templates/robots.txt b/cl/simple_pages/templates/robots.txt index daf18f8e29..c61dc1c253 100644 --- a/cl/simple_pages/templates/robots.txt +++ b/cl/simple_pages/templates/robots.txt @@ -95,6 +95,38 @@ Disallow: /wpd/ Disallow: /txt/ Disallow: /doc/ +# AI Bots (training, search/indexing, and user-initiated) +User-agent: ClaudeBot +User-agent: Claude-SearchBot +User-agent: Claude-User +User-agent: GPTBot +User-agent: OAI-SearchBot +User-agent: ChatGPT-User +User-agent: PerplexityBot +User-agent: Perplexity-User +User-agent: Google-Extended +User-agent: Gemini-Deep-Research +User-agent: Applebot +User-agent: Applebot-Extended +User-agent: FacebookBot +User-agent: meta-externalagent +User-agent: Amazonbot +User-agent: Bytespider +User-agent: CCBot +User-agent: MistralAI-User +Allow: /robots.txt +Allow: /sitemap.xml +Allow: /help/ +Allow: /faq/ +Allow: /feeds/ +Allow: /podcasts/ +Allow: /contact/ +Allow: /terms/ +Allow: /privacy/ +Allow: /removal/ +Allow: /donate/ +Disallow: / + # Baidu, Blekko, Others # No support for robots meta tag nor x-robots-tag. # Be conservative; Block everything. 
From 2aee8ae5e47bd4c9a3189175ae61332c167c93fd Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Thu, 19 Mar 2026 11:41:19 -0400 Subject: [PATCH 82/87] feat(simple_pages): remove AppleBot and FacebookBot from AI bot rules --- cl/simple_pages/templates/robots.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/cl/simple_pages/templates/robots.txt b/cl/simple_pages/templates/robots.txt index c61dc1c253..71d19cb9ea 100644 --- a/cl/simple_pages/templates/robots.txt +++ b/cl/simple_pages/templates/robots.txt @@ -106,9 +106,7 @@ User-agent: PerplexityBot User-agent: Perplexity-User User-agent: Google-Extended User-agent: Gemini-Deep-Research -User-agent: Applebot User-agent: Applebot-Extended -User-agent: FacebookBot User-agent: meta-externalagent User-agent: Amazonbot User-agent: Bytespider From 281b197eb0b1e129552c07c9bd4aac4c0437d3c9 Mon Sep 17 00:00:00 2001 From: Nathan Dahlberg Date: Thu, 19 Mar 2026 11:45:28 -0400 Subject: [PATCH 83/87] feat(simple_pages): group AI bot agents by type --- cl/simple_pages/templates/robots.txt | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cl/simple_pages/templates/robots.txt b/cl/simple_pages/templates/robots.txt index 71d19cb9ea..aea9c441f6 100644 --- a/cl/simple_pages/templates/robots.txt +++ b/cl/simple_pages/templates/robots.txt @@ -95,22 +95,25 @@ Disallow: /wpd/ Disallow: /txt/ Disallow: /doc/ -# AI Bots (training, search/indexing, and user-initiated) +# AI Bots +# Training/Scraping bots — crawl to build datasets for model training User-agent: ClaudeBot -User-agent: Claude-SearchBot -User-agent: Claude-User User-agent: GPTBot -User-agent: OAI-SearchBot -User-agent: ChatGPT-User -User-agent: PerplexityBot -User-agent: Perplexity-User User-agent: Google-Extended -User-agent: Gemini-Deep-Research User-agent: Applebot-Extended User-agent: meta-externalagent -User-agent: Amazonbot User-agent: Bytespider User-agent: CCBot +# Search/Indexing bots — crawl to power AI-assisted search 
results +User-agent: Claude-SearchBot +User-agent: OAI-SearchBot +User-agent: PerplexityBot +User-agent: Amazonbot +User-agent: Gemini-Deep-Research +# User-initiated agents — fetch pages on behalf of a real user in a conversation +User-agent: Claude-User +User-agent: ChatGPT-User +User-agent: Perplexity-User User-agent: MistralAI-User Allow: /robots.txt Allow: /sitemap.xml From a657c48123fe2c37e8ef73fcd9b0e359bea5c24a Mon Sep 17 00:00:00 2001 From: Elisa Anguita Date: Thu, 19 Mar 2026 17:32:26 -0300 Subject: [PATCH 84/87] style(help): use descendant selector for link color and fix tabnabbing - Replace text-primary-600 with [&_a]:text-primary-600 on
    elements so the color targets links, not list item text - Add rel="noreferrer" to external Judge Coverage link Cherry-picked from 3f9c96d (PR #6572) to unblock other PRs touching the help index page. Co-Authored-By: Claude Opus 4.6 --- cl/simple_pages/templates/help/index.html | 2 +- cl/simple_pages/templates/v2_help/index.html | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cl/simple_pages/templates/help/index.html b/cl/simple_pages/templates/help/index.html index 1f4d509279..de0ad07453 100644 --- a/cl/simple_pages/templates/help/index.html +++ b/cl/simple_pages/templates/help/index.html @@ -48,7 +48,7 @@

    About our Data

  1. PACER Data Coverage

  2. Case Law Coverage

  3. Financial Disclosure Coverage

  4. -
  5. Judge Coverage

  6. +
  7. Judge Coverage

  8. Oral Argument Recording Coverage

diff --git a/cl/simple_pages/templates/v2_help/index.html b/cl/simple_pages/templates/v2_help/index.html index 020f2d191d..040efeaf67 100644 --- a/cl/simple_pages/templates/v2_help/index.html +++ b/cl/simple_pages/templates/v2_help/index.html @@ -20,7 +20,7 @@

Getting Help on

    + class="list-decimal [&_a]:text-primary-600 marker:text-greyscale-800 marker:font-normal text-sm font-normal space-y-2">
  1. Help with search and docket alerts
  2. Help with @recap.email
  3. Help with advanced search parameters
  4. @@ -43,12 +43,12 @@

    About our Data

    We've built some of the biggest open datasets in the world. Learn more about them:

@@ -62,7 +62,7 @@

Developer Documenta
    + [&_a]:text-primary-600 space-y-4">
  1. API Documentation

      From 40fdfe6ac5737ae97e69163230c183c7c4fd0193 Mon Sep 17 00:00:00 2001 From: Alberto Islas Date: Thu, 19 Mar 2026 16:30:34 -0600 Subject: [PATCH 85/87] refactor(search): rename opinion ES index to case_law_index --- cl/search/es_indices.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cl/search/es_indices.py b/cl/search/es_indices.py index 8adb416307..73b6600059 100644 --- a/cl/search/es_indices.py +++ b/cl/search/es_indices.py @@ -48,7 +48,7 @@ # Define people elasticsearch index # Define opinion elasticsearch index -opinion_index = Index("opinion_index") +opinion_index = Index("case_law_index") opinion_index.settings( number_of_shards=settings.ELASTICSEARCH_OPINION_NUMBER_OF_SHARDS, number_of_replicas=settings.ELASTICSEARCH_OPINION_NUMBER_OF_REPLICAS, From ebf61ae0d85575183d175dd5aaf4db0444064a37 Mon Sep 17 00:00:00 2001 From: grossir <14970769+grossir@users.noreply.github.com> Date: Fri, 20 Mar 2026 15:15:06 +0000 Subject: [PATCH 86/87] Update freelawproject dependencies --- uv.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index ea3dca1b20..cd45ab7525 100644 --- a/uv.lock +++ b/uv.lock @@ -1855,7 +1855,7 @@ wheels = [ [[package]] name = "juriscraper" -version = "3.0.1" +version = "3.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "certifi" }, @@ -1874,9 +1874,9 @@ dependencies = [ { name = "selenium" }, { name = "tldextract" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ab/f5/4c80d7a2520ad74713ce4e4ccec453f8e97d43920bfe5899fa8b4a981753/juriscraper-3.0.1.tar.gz", hash = "sha256:41a7f75b655bac3b8237dbb910242d290eee42fa4dc6e5ffbbb80af78592b60e", size = 385421, upload-time = "2026-03-19T01:38:46.037Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/0d/450d9a0a7c9b8f78ac00e420d3865148b3953db5e0172d3d2c0750b6b8dd/juriscraper-3.0.2.tar.gz", hash = "sha256:3273ad81e59cdf1c6789a54aaf421c4e46a80940ef668947d84cdb23056fab48", size = 
386253, upload-time = "2026-03-20T15:11:22.283Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/42/f9/5813aed6741d04855d77880c0169edea0f06fecaff723e8f9e20476a819b/juriscraper-3.0.1-py3-none-any.whl", hash = "sha256:2e7cf5aed11df670e92d2c217b8e51827d808088f01bc8f23a4caddde7a6be5c", size = 611793, upload-time = "2026-03-19T01:38:44.016Z" }, + { url = "https://files.pythonhosted.org/packages/90/6a/b7836b8301357d26463e329b0341e21f51ad3f23bc55387ae56be405dbfb/juriscraper-3.0.2-py3-none-any.whl", hash = "sha256:5b202d41cee5d2d22dd77778d162407b0530518e83f8b476fde7dcd342deee92", size = 612795, upload-time = "2026-03-20T15:11:20.583Z" }, ] [[package]] From 812e80ab6d77ea5cb53d7d50498c5b08a588639f Mon Sep 17 00:00:00 2001 From: ttys0dev <126845556+ttys0dev@users.noreply.github.com> Date: Wed, 26 Mar 2025 17:06:52 -0600 Subject: [PATCH 87/87] Migrate to async juriscraper --- .../commands/handle_old_docket_alerts.py | 21 +- .../management/commands/monitor_pacer.py | 9 +- cl/api/utils.py | 25 +- cl/api/webhooks.py | 19 +- cl/corpus_importer/bulk_utils.py | 5 +- .../management/commands/760_project.py | 5 +- .../management/commands/adelman_david.py | 3 +- .../management/commands/buchwald_project.py | 9 +- .../commands/claims_activity_project.py | 3 +- .../management/commands/everything_project.py | 5 +- .../management/commands/export_control.py | 3 +- .../management/commands/get_pacer_doc_ids.py | 7 +- .../management/commands/import_patent.py | 3 +- .../management/commands/jackson_project.py | 3 +- .../management/commands/kessler_ilnb.py | 9 +- .../commands/list_of_creditors_project.py | 3 +- .../management/commands/nos_700.py | 7 +- .../management/commands/nywb_chapter_7.py | 5 +- cl/corpus_importer/tasks.py | 259 +++++++++--------- cl/corpus_importer/utils.py | 14 +- cl/favorites/tasks.py | 5 +- cl/lasc/tasks.py | 17 +- cl/lib/pacer_session.py | 50 ++-- cl/lib/tests.py | 6 +- cl/recap/api_serializers.py | 9 +- .../commands/merge_idb_into_dockets.py | 5 +- 
cl/recap/mergers.py | 7 +- cl/recap/tasks.py | 258 ++++++++--------- cl/recap/tests/test_recap_email.py | 99 +++---- cl/recap_rss/tasks.py | 8 +- cl/scrapers/tasks.py | 14 +- cl/scrapers/utils.py | 4 +- .../management/commands/pacer_bulk_fetch.py | 3 +- cl/search/tests/test_pacer_bulk_fetch.py | 18 +- .../tests/tests_semantic_search_opinion.py | 48 ++-- cl/tests/fakes.py | 26 +- pyproject.toml | 5 +- uv.lock | 37 +-- 38 files changed, 519 insertions(+), 517 deletions(-) diff --git a/cl/alerts/management/commands/handle_old_docket_alerts.py b/cl/alerts/management/commands/handle_old_docket_alerts.py index 50fd22fdc9..e68fad1df7 100644 --- a/cl/alerts/management/commands/handle_old_docket_alerts.py +++ b/cl/alerts/management/commands/handle_old_docket_alerts.py @@ -1,5 +1,6 @@ from argparse import RawTextHelpFormatter +from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings from django.contrib.auth.models import User from django.core.mail import EmailMultiAlternatives @@ -78,7 +79,7 @@ def build_user_report(user, delete=False): return report -def send_old_alert_warning_email_and_webhook(user, report) -> int: +async def send_old_alert_warning_email_and_webhook(user, report) -> int: """Send alerts emails and webhooks for old alerts :param user: The user with terminated dockets @@ -91,17 +92,21 @@ def send_old_alert_warning_email_and_webhook(user, report) -> int: ) webhook_count = 0 if report.very_old_alerts or report.disabled_alerts: - for user_webhook in user_webhooks: - send_old_alerts_webhook_event(user_webhook, report) + async for user_webhook in user_webhooks: + await send_old_alerts_webhook_event(user_webhook, report) webhook_count += 1 count = report.total_count() subject_template = loader.get_template("emails/old_email_subject.txt") subject = subject_template.render({"count": count}).strip() - txt = loader.get_template("emails/old_alert_email.txt").render( + txt = await sync_to_async( + 
loader.get_template("emails/old_alert_email.txt").render + )( {"report_data": report}, ) - html = loader.get_template("emails/old_alert_email.html").render( + html = await sync_to_async( + loader.get_template("emails/old_alert_email.html").render + )( {"report_data": report}, ) msg = EmailMultiAlternatives( @@ -174,9 +179,9 @@ def handle(self, *args, **options): count = report.total_count() if options["send_alerts"] and count > 0: emails_sent += 1 - webhooks_count = send_old_alert_warning_email_and_webhook( - user, report - ) + webhooks_count = async_to_sync( + send_old_alert_warning_email_and_webhook + )(user, report) webhooks_sent += webhooks_count logger.info( diff --git a/cl/alerts/management/commands/monitor_pacer.py b/cl/alerts/management/commands/monitor_pacer.py index 514b063c71..8ff85fe1fe 100644 --- a/cl/alerts/management/commands/monitor_pacer.py +++ b/cl/alerts/management/commands/monitor_pacer.py @@ -1,6 +1,7 @@ import datetime import time +from asgiref.sync import async_to_sync from django.conf import settings from django.core.mail import send_mail from django.template import loader @@ -48,12 +49,12 @@ def handle(self, *args, **options): s = ProxyPacerSession( username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD ) - s.login() + async_to_sync(s.login)() report = CaseQueryAdvancedBankruptcy("canb", s) t1 = now() while True: query = "Pacific" - report.query( + async_to_sync(report.query)( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), @@ -66,7 +67,7 @@ def handle(self, *args, **options): exit(0) query = "PG&E" - report.query( + async_to_sync(report.query)( name_last=query, filed_from=datetime.date(2019, 1, 28), filed_to=datetime.date(2019, 1, 30), @@ -83,5 +84,5 @@ def handle(self, *args, **options): min_login_frequency = 60 * 30 # thirty minutes if (t2 - t1).seconds > min_login_frequency: print("Logging in again.") - s.login() + async_to_sync(s.login)() t1 = now() diff --git a/cl/api/utils.py 
b/cl/api/utils.py index fa3d908dd0..75f3e1ad58 100644 --- a/cl/api/utils.py +++ b/cl/api/utils.py @@ -7,6 +7,7 @@ from typing import Any, TypedDict import eyecite +from asgiref.sync import async_to_sync, sync_to_async from dateutil import parser from dateutil.rrule import DAILY, rrule from django.conf import settings @@ -25,7 +26,7 @@ from django.views.decorators.vary import vary_on_headers from django_ratelimit.core import get_header from eyecite.tokenizers import HyperscanTokenizer -from requests import Response +from httpx import Response from rest_framework import serializers from rest_framework.exceptions import Throttled, ValidationError from rest_framework.metadata import SimpleMetadata @@ -1185,7 +1186,7 @@ def get_next_webhook_retry_date(retry_counter: int) -> datetime: WEBHOOK_MAX_RETRY_COUNTER = 7 -def check_webhook_failure_count_and_notify( +async def check_webhook_failure_count_and_notify( webhook_event: WebhookEvent, ) -> None: """Check if a Webhook needs to be disabled and/or send a notification about @@ -1208,7 +1209,7 @@ def check_webhook_failure_count_and_notify( 6: False, 7: True, # Send webhook disabled notification } - webhook = webhook_event.webhook + webhook = await Webhook.objects.aget(pk=webhook_event.webhook_id) if not webhook.enabled or webhook_event.debug: return @@ -1218,33 +1219,33 @@ def check_webhook_failure_count_and_notify( current_try_counter = webhook_event.retry_counter notify = notify_on[current_try_counter] if notify: - oldest_enqueued_for_retry = WebhookEvent.objects.filter( - webhook=webhook_event.webhook, + oldest_enqueued_for_retry = await WebhookEvent.objects.filter( + webhook=webhook, event_status=WEBHOOK_EVENT_STATUS.ENQUEUED_RETRY, debug=False, - ).earliest("date_created") + ).aearliest("date_created") if current_try_counter >= WEBHOOK_MAX_RETRY_COUNTER: webhook.enabled = False update_fields.append("enabled") update_fields.append("date_modified") # If the parent webhook is disabled mark all current ENQUEUED_RETRY # 
events as ENDPOINT_DISABLED - WebhookEvent.objects.filter( - webhook=webhook_event.webhook, + await WebhookEvent.objects.filter( + webhook=webhook, event_status=WEBHOOK_EVENT_STATUS.ENQUEUED_RETRY, debug=False, - ).update( + ).aupdate( event_status=WEBHOOK_EVENT_STATUS.ENDPOINT_DISABLED, date_modified=now(), ) if oldest_enqueued_for_retry.pk == webhook_event.pk: failure_counter = current_try_counter + 1 - notify_failing_webhook.delay( + await sync_to_async(notify_failing_webhook.delay)( webhook_event.pk, failure_counter, webhook.enabled ) # Save webhook and avoid emailing admins via signal in cl.users.signals - webhook.save(update_fields=update_fields) + await webhook.asave(update_fields=update_fields) def update_webhook_event_after_request( @@ -1288,7 +1289,7 @@ def update_webhook_event_after_request( if error is None: error = "" webhook_event.error_message = error - check_webhook_failure_count_and_notify(webhook_event) + async_to_sync(check_webhook_failure_count_and_notify)(webhook_event) if webhook_event.retry_counter >= WEBHOOK_MAX_RETRY_COUNTER: # If the webhook has reached the max retry counter, mark as failed webhook_event.event_status = WEBHOOK_EVENT_STATUS.FAILED diff --git a/cl/api/webhooks.py b/cl/api/webhooks.py index f6ff5f50f7..5c2c17be34 100644 --- a/cl/api/webhooks.py +++ b/cl/api/webhooks.py @@ -2,7 +2,9 @@ import random import requests +from asgiref.sync import sync_to_async from django.conf import settings +from django.contrib.auth.models import User from elasticsearch_dsl.response import Response from rest_framework.renderers import JSONRenderer @@ -80,7 +82,7 @@ def send_webhook_event( update_webhook_event_after_request(webhook_event, error=error_str) -def send_old_alerts_webhook_event( +async def send_old_alerts_webhook_event( webhook: Webhook, report: OldAlertReport ) -> None: """Send webhook event for old alerts @@ -115,14 +117,14 @@ def send_old_alerts_webhook_event( post_content, accepted_media_type="application/json;", ) - webhook_event = 
WebhookEvent.objects.create( + webhook_event = await WebhookEvent.objects.acreate( webhook=webhook, content=post_content, ) - send_webhook_event(webhook_event, json_bytes) + await sync_to_async(send_webhook_event)(webhook_event, json_bytes) -def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: +async def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: """Send webhook event for processed PacerFetchQueue objects. :param fq: The PacerFetchQueue object related to the event. @@ -137,10 +139,11 @@ def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: PROCESSING_STATUS.INVALID_CONTENT, PROCESSING_STATUS.NEEDS_INFO, ]: - user_webhooks = fq.user.webhooks.filter( + user = await User.objects.aget(pk=fq.user_id) + user_webhooks = user.webhooks.filter( event_type=WebhookEventType.RECAP_FETCH, enabled=True ) - for webhook in user_webhooks: + async for webhook in user_webhooks: payload = PacerFetchQueueSerializer(fq).data post_content = { "webhook": generate_webhook_key_content(webhook), @@ -151,11 +154,11 @@ def send_recap_fetch_webhooks(fq: PacerFetchQueue) -> None: post_content, accepted_media_type="application/json;", ) - webhook_event = WebhookEvent.objects.create( + webhook_event = await WebhookEvent.objects.acreate( webhook=webhook, content=post_content, ) - send_webhook_event(webhook_event, json_bytes) + await sync_to_async(send_webhook_event)(webhook_event, json_bytes) def send_search_alert_webhook( diff --git a/cl/corpus_importer/bulk_utils.py b/cl/corpus_importer/bulk_utils.py index 5c5919d1d6..1ab1574df6 100644 --- a/cl/corpus_importer/bulk_utils.py +++ b/cl/corpus_importer/bulk_utils.py @@ -1,3 +1,4 @@ +from asgiref.sync import async_to_sync from celery import chain from cl.corpus_importer.tasks import get_pacer_doc_by_rd @@ -45,7 +46,7 @@ def get_petitions( session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - session.login() + async_to_sync(session.login)() for i, rd_pk in enumerate(rds): if i < options["offset"]: i 
+= 1 @@ -57,7 +58,7 @@ def get_petitions( session = ProxyPacerSession( username=pacer_username, password=pacer_password ) - session.login() + async_to_sync(session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() diff --git a/cl/corpus_importer/management/commands/760_project.py b/cl/corpus_importer/management/commands/760_project.py index 90ecf391b7..bd1fdc4352 100644 --- a/cl/corpus_importer/management/commands/760_project.py +++ b/cl/corpus_importer/management/commands/760_project.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -34,7 +35,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, session.proxy_address) for i, row in enumerate(reader): if i < options["offset"]: @@ -101,7 +102,7 @@ def get_att_pages(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() get_district_attachment_pages( options=options, rd_pks=rd_pks, tag_names=[TAG], session=session ) diff --git a/cl/corpus_importer/management/commands/adelman_david.py b/cl/corpus_importer/management/commands/adelman_david.py index b37279b9a6..ae4c0c6530 100644 --- a/cl/corpus_importer/management/commands/adelman_david.py +++ b/cl/corpus_importer/management/commands/adelman_david.py @@ -1,6 +1,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -31,7 +32,7 @@ def download_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, session.proxy_address) for i, row in enumerate(reader): if i < 
options["offset"]: diff --git a/cl/corpus_importer/management/commands/buchwald_project.py b/cl/corpus_importer/management/commands/buchwald_project.py index 3bb5f2f292..7ff1690d53 100644 --- a/cl/corpus_importer/management/commands/buchwald_project.py +++ b/cl/corpus_importer/management/commands/buchwald_project.py @@ -1,6 +1,7 @@ import os from argparse import RawTextHelpFormatter +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -36,7 +37,7 @@ def add_all_nysd_to_cl(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # IDs obtained by binary search of docket numbers on PACER website. earliest_id = 405990 @@ -53,7 +54,7 @@ def add_all_nysd_to_cl(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("Doing pacer_case_id: %s", pacer_case_id) @@ -72,7 +73,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() buchwald_id = 450 ds = ( @@ -95,7 +96,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("%s: Doing docket with pk: %s", i, d.pk) diff --git a/cl/corpus_importer/management/commands/claims_activity_project.py b/cl/corpus_importer/management/commands/claims_activity_project.py index 6ba8a66565..a9e57979f2 100644 --- a/cl/corpus_importer/management/commands/claims_activity_project.py +++ b/cl/corpus_importer/management/commands/claims_activity_project.py @@ -6,6 +6,7 @@ from datetime import date import pandas as pd +from asgiref.sync import async_to_sync from django.conf import settings from juriscraper.pacer import ClaimsActivity @@ -45,7 +46,7 @@ def 
query_and_parse_claims_activity( } s = ProxyPacerSession(username=PACER_USERNAME, password=PACER_PASSWORD) - s.login() + async_to_sync(s.login)() for court_id in courts: court = map_cl_to_pacer_id(court_id) for alias, creditor_name in creditor_names.items(): diff --git a/cl/corpus_importer/management/commands/everything_project.py b/cl/corpus_importer/management/commands/everything_project.py index bbe84f8daf..315cdef323 100644 --- a/cl/corpus_importer/management/commands/everything_project.py +++ b/cl/corpus_importer/management/commands/everything_project.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -115,7 +116,7 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(items): if i < options["offset"]: continue @@ -128,7 +129,7 @@ def get_dockets(options, items, tags, sample_size=0, doc_num_end=""): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # All tests pass. Get the docket. 
logger.info("Doing row %s: %s", i, row) diff --git a/cl/corpus_importer/management/commands/export_control.py b/cl/corpus_importer/management/commands/export_control.py index 882aa74741..4c606bcd41 100644 --- a/cl/corpus_importer/management/commands/export_control.py +++ b/cl/corpus_importer/management/commands/export_control.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.task_canvases import get_docket_and_claims @@ -70,7 +71,7 @@ def get_data(options, row_transform, tags): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue diff --git a/cl/corpus_importer/management/commands/get_pacer_doc_ids.py b/cl/corpus_importer/management/commands/get_pacer_doc_ids.py index 764465d978..79b674112b 100644 --- a/cl/corpus_importer/management/commands/get_pacer_doc_ids.py +++ b/cl/corpus_importer/management/commands/get_pacer_doc_ids.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.tasks import get_pacer_doc_id_with_show_case_doc_url @@ -12,7 +13,7 @@ PACER_PASSWORD = os.environ.get("PACER_PASSWORD", settings.PACER_PASSWORD) -def get_pacer_doc_ids(options): +async def get_pacer_doc_ids(options): """Get pacer_doc_ids for any item that needs them.""" q = options["queue"] throttle = CeleryThrottle(queue_name=q) @@ -37,7 +38,7 @@ def get_pacer_doc_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() logger.info( f"Sent {completed} tasks to celery so far. 
Latest pk: {row_pk}" ) @@ -72,4 +73,4 @@ def add_arguments(self, parser): def handle(self, *args, **options): super().handle(*args, **options) - get_pacer_doc_ids(options) + async_to_sync(get_pacer_doc_ids)(options) diff --git a/cl/corpus_importer/management/commands/import_patent.py b/cl/corpus_importer/management/commands/import_patent.py index 045654a0d6..15485205c0 100644 --- a/cl/corpus_importer/management/commands/import_patent.py +++ b/cl/corpus_importer/management/commands/import_patent.py @@ -1,6 +1,7 @@ import os import time +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -42,7 +43,7 @@ def get_dockets(options: dict) -> None: session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() session_data = SessionData(session.cookies, session.proxy_address) NOS_CODES = [PATENT, PATENT_ANDA] DISTRICTS = ["ded", "txwd"] diff --git a/cl/corpus_importer/management/commands/jackson_project.py b/cl/corpus_importer/management/commands/jackson_project.py index 97ca50d29b..2caede24e4 100644 --- a/cl/corpus_importer/management/commands/jackson_project.py +++ b/cl/corpus_importer/management/commands/jackson_project.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery import chain from django.conf import settings @@ -22,7 +23,7 @@ def get_dockets(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() jackson_id = 1609 ds = Docket.objects.filter(court_id="dcd", assigned_to_id=jackson_id) diff --git a/cl/corpus_importer/management/commands/kessler_ilnb.py b/cl/corpus_importer/management/commands/kessler_ilnb.py index e93c5e0824..8778b59b78 100644 --- a/cl/corpus_importer/management/commands/kessler_ilnb.py +++ b/cl/corpus_importer/management/commands/kessler_ilnb.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync 
from celery.canvas import chain from django.conf import settings @@ -37,7 +38,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue @@ -48,7 +49,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() @@ -93,7 +94,7 @@ def get_final_docs(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, de in enumerate(des): if i < options["offset"]: i += 1 @@ -104,7 +105,7 @@ def get_final_docs(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) rd_pks = ( diff --git a/cl/corpus_importer/management/commands/list_of_creditors_project.py b/cl/corpus_importer/management/commands/list_of_creditors_project.py index 93e79dda20..e0d82e188a 100644 --- a/cl/corpus_importer/management/commands/list_of_creditors_project.py +++ b/cl/corpus_importer/management/commands/list_of_creditors_project.py @@ -5,6 +5,7 @@ import re from typing import TypedDict, cast +from asgiref.sync import async_to_sync from django.conf import settings from cl.corpus_importer.bulk_utils import make_bankr_docket_number @@ -72,7 +73,7 @@ def query_and_save_creditors_data(options: OptionsType) -> None: session = ProxyPacerSession( username=CLIENT_PACER_USERNAME, password=CLIENT_PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle = CeleryThrottle(queue_name=q) completed = 0 for i, rows 
in enumerate( diff --git a/cl/corpus_importer/management/commands/nos_700.py b/cl/corpus_importer/management/commands/nos_700.py index 6d383ffe95..57d5d8d077 100644 --- a/cl/corpus_importer/management/commands/nos_700.py +++ b/cl/corpus_importer/management/commands/nos_700.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings @@ -231,7 +232,7 @@ def get_dockets(options, items, tags, sample_size=0): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, row in enumerate(items): if i < options["offset"]: continue @@ -244,7 +245,7 @@ def get_dockets(options, items, tags, sample_size=0): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() # All tests pass. Get the docket. logger.info("Doing row %s: %s", i, row) @@ -281,7 +282,7 @@ def get_attachment_pages(options, tag): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() get_district_attachment_pages( options=options, rd_pks=rd_pks, tag_names=[tag], session=session ) diff --git a/cl/corpus_importer/management/commands/nywb_chapter_7.py b/cl/corpus_importer/management/commands/nywb_chapter_7.py index 9fa791a63c..dfafc1d13d 100644 --- a/cl/corpus_importer/management/commands/nywb_chapter_7.py +++ b/cl/corpus_importer/management/commands/nywb_chapter_7.py @@ -2,6 +2,7 @@ import csv import os +from asgiref.sync import async_to_sync from celery.canvas import chain from cl.corpus_importer.bulk_utils import ( @@ -32,7 +33,7 @@ def get_dockets(options): pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() for i, row in enumerate(reader): if i < options["offset"]: continue @@ -43,7 +44,7 @@ def get_dockets(options): 
pacer_session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - pacer_session.login() + async_to_sync(pacer_session.login)() logger.info(f"Sent {i} tasks to celery so far.") logger.info("Doing row %s", i) throttle.maybe_wait() diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index dd154b5377..15452e01a6 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -20,9 +20,10 @@ import botocore.exceptions import environ import eyecite +import httpx import internetarchive as ia import requests -from asgiref.sync import async_to_sync +from asgiref.sync import async_to_sync, sync_to_async from celery import Task, chain from celery.exceptions import SoftTimeLimitExceeded from django.conf import settings @@ -34,10 +35,15 @@ from django.utils.timezone import localtime, now from eyecite.tokenizers import HyperscanTokenizer from httpx import ( + ConnectError, + HTTPError, HTTPStatusError, NetworkError, ReadError, + ReadTimeout, RemoteProtocolError, + RequestError, + Response, TimeoutException, ) from juriscraper.lib.exceptions import PacerLoginException, ParsingException @@ -92,17 +98,8 @@ ) from pydantic import ValidationError from redis import ConnectionError as RedisConnectionError -from requests import Response -from requests.exceptions import ( - ConnectionError, - HTTPError, - ReadTimeout, - RequestException, - Timeout, -) from rest_framework.renderers import JSONRenderer from sentry_sdk import capture_exception -from urllib3.exceptions import ReadTimeoutError from cl.alerts.tasks import enqueue_docket_alert, send_alert_and_webhook from cl.audio.models import Audio @@ -362,9 +359,7 @@ def upload_recap_json(self, pk: int, database: str = "default") -> None: increment_failure_count(d) -@app.task(bind=True, max_retries=5) -def download_recap_item( - self, +async def download_recap_item( url: str, filename: str, clobber: bool = False, @@ -374,33 +369,27 @@ def download_recap_item( try: if 
os.path.isfile(location) and not clobber: raise OSError(f" IOError: File already exists at {location}") - r = requests.get( - url, - stream=True, - timeout=60, - headers={"User-Agent": "Free Law Project"}, - ) + async with httpx.AsyncClient() as client: + r = await client.get( + url, + timeout=60, + headers={"User-Agent": "Free Law Project"}, + ) r.raise_for_status() - except requests.Timeout as e: - logger.warning(" Timed out attempting to get: %s\n", url) - raise self.retry(exc=e, countdown=2) - except requests.RequestException as e: - logger.warning(" Unable to get %s\nException was:\n%s", url, e) + except TimeoutException as e: + logger.warning(f" Timed out attempting to get: {url}\n") + except RequestError as e: + logger.warning(f" Unable to get {url}\nException was:\n{e}") except OSError as e: - logger.warning(" %s", e) + logger.warning(f" {e}") else: with NamedTemporaryFile(prefix="recap_download_") as tmp: r.raw.decode_content = True - try: - shutil.copyfileobj(r.raw, tmp) - tmp.flush() - except ReadTimeoutError as exc: - # The download failed part way through. - raise self.retry(exc=exc) - else: - # Successful download. Copy from tmp to the right spot. Note - # that this will clobber. - shutil.copyfile(tmp.name, location) + shutil.copyfileobj(r.raw, tmp) + tmp.flush() + # Successful download. Copy from tmp to the right spot. Note + # that this will clobber. 
+ shutil.copyfile(tmp.name, location) @app.task( @@ -421,7 +410,7 @@ def get_and_save_free_document_report( :param log_id: a PACERFreeDocumentLog object id :return: The status code of the scrape """ - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -435,11 +424,11 @@ def get_and_save_free_document_report( report = FreeOpinionReport(court_id, s) msg = "" try: - report.query(start, end, sort="case_number") + async_to_sync(report.query)(start, end, sort="case_number") except ( TypeError, - RequestException, - ReadTimeoutError, + RequestError, + ReadTimeout, PacerLoginException, ParsingException, SoftTimeLimitExceeded, @@ -450,7 +439,7 @@ def get_and_save_free_document_report( "TypeError getting free document report results, likely due " "to failure to get Nonce." ) - elif isinstance(exc, (RequestException | ReadTimeoutError)): + elif isinstance(exc, (RequestError | ReadTimeout)): msg = "Unable to get free document report results" elif isinstance(exc, PacerLoginException): msg = "PacerLoginException while getting free docs" @@ -724,13 +713,13 @@ def get_and_process_free_pdf( return None raise self.retry() - cookies_data = get_or_cache_pacer_cookies( + cookies_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, ) try: - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, result.pacer_case_id, result.pacer_doc_id, @@ -772,14 +761,14 @@ def get_and_process_free_pdf( msg = "PacerLoginException while getting free docs." 
logger.info(f"{msg} Retrying.") # noqa: G004 # Refresh cookies before retrying - get_or_cache_pacer_cookies( + async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", username=settings.PACER_USERNAME, password=settings.PACER_PASSWORD, refresh=True, ) raise self.retry(exc=exc) - except (ReadTimeoutError, requests.RequestException) as exc: + except (ReadTimeout, RequestError) as exc: msg = "Request exception getting free PDF" if self.request.retries == self.max_retries: logger.warning(msg) @@ -792,8 +781,7 @@ def get_and_process_free_pdf( if r: pdf_bytes = r.content attachment_number = 0 # Always zero for free opinions - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd.pk, pdf_bytes, r_msg, @@ -806,6 +794,7 @@ def get_and_process_free_pdf( if success is False: PACERFreeDocumentRow.objects.filter(pk=row_pk).update(error_msg=msg) + self.request.chain = None return None rd.refresh_from_db() @@ -879,7 +868,7 @@ def upload_to_ia( source_url: str, media_type: str, description: str, -) -> list[Response] | None: +) -> list[requests.Response] | None: """Upload an item and its files to the Internet Archive On the Internet Archive there are Items and files. 
Items have a global @@ -1106,7 +1095,7 @@ def get_pacer_case_id_and_title( ) if not session_data and user_pk: - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk) if not session_data: raise Exception("Cookies not available in cache") else: @@ -1120,9 +1109,9 @@ def get_pacer_case_id_and_title( report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s) msg = "" try: - report.query(docket_number) - except (RequestException, ReadTimeoutError, PacerLoginException) as exc: - if isinstance(exc, (RequestException | ReadTimeoutError)): + async_to_sync(report.query)(docket_number) + except (RequestError, ReadTimeout, PacerLoginException) as exc: + if isinstance(exc, (RequestError | ReadTimeout)): msg = ( "Network error while running possible case number query on: " "%s.%s" @@ -1155,7 +1144,7 @@ def get_pacer_case_id_and_title( @app.task( bind=True, - autoretry_for=(PacerLoginException, RequestException), + autoretry_for=(PacerLoginException, RequestError), max_retries=5, interval_start=5 * 60, interval_step=10 * 60, @@ -1199,7 +1188,7 @@ def do_case_query_by_pacer_case_id( except Docket.MultipleObjectsReturned: d = None - report.query(pacer_case_id) + async_to_sync(report.query)(pacer_case_id) docket_data = report.data logger.info( "Querying and parsing complete for %s.%s", court_id, pacer_case_id @@ -1287,7 +1276,7 @@ def filter_docket_by_tags( return data -def query_case_query_report( +async def query_case_query_report( court_id: str, pacer_case_id: int ) -> tuple[dict[str, Any], str]: """Query the iquery page for a given PACER case ID. @@ -1297,7 +1286,7 @@ def query_case_query_report( :return: A two tuple, the report data and the report HTML text. 
""" - session_data = get_or_cache_pacer_cookies( + session_data = await get_or_cache_pacer_cookies( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -1309,7 +1298,7 @@ def query_case_query_report( proxy=session_data.proxy_address, ) report = CaseQuery(map_cl_to_pacer_id(court_id), s) - report.query(pacer_case_id) + await report.query(pacer_case_id) return report.data, report.response.text @@ -1339,10 +1328,10 @@ def make_docket_by_iquery_base( """ try: - report_data, report_text = query_case_query_report( + report_data, report_text = async_to_sync(query_case_query_report)( court_id, pacer_case_id ) - except (requests.Timeout, requests.RequestException) as exc: + except (TimeoutException, RequestError) as exc: logger.warning( "Timeout or unknown RequestException on iquery crawl. " "Trying again if retries not exceeded." @@ -1482,7 +1471,7 @@ def make_docket_by_iquery_sweep( ) -@retry((requests.Timeout, PacerLoginException), tries=3, delay=0.25, backoff=1) +@retry((TimeoutException, PacerLoginException), tries=3, delay=0.25, backoff=1) def query_iquery_page( court_id: str, pacer_case_id: int ) -> tuple[bool, None] | tuple[dict[str, Any], str]: @@ -1495,7 +1484,9 @@ def query_iquery_page( and the report HTML text. """ - report_data, report_text = query_case_query_report(court_id, pacer_case_id) + report_data, report_text = async_to_sync(query_case_query_report)( + court_id, pacer_case_id + ) if not report_data: logger.info( "No valid data found in iquery page for %s.%s", @@ -1580,6 +1571,13 @@ def probe_or_scrape_iquery_pages( report_data, report_text = query_iquery_page( court_id, pacer_case_id_to_lookup ) + except TimeoutException: + logger.warning( + "The court %s website is probably down. Aborting the probe task.", + court_id, + ) + break + except HTTPError: # Set expiration accordingly and value to 2 to difference from # other waiting times. 
@@ -1624,13 +1622,6 @@ def probe_or_scrape_iquery_pages( delete_redis_semaphore("CACHE", make_iquery_probing_key(court_id)) return None - except requests.Timeout: - logger.warning( - "The court %s website is probably down. Aborting the probe task.", - court_id, - ) - break - if report_data: # Find and update/store the Docket. reports_data.append( @@ -1818,8 +1809,8 @@ def get_docket_by_pacer_case_id( ) report = DocketReport(map_cl_to_pacer_id(court_id), s) try: - report.query(pacer_case_id, **kwargs) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(pacer_case_id, **kwargs) + except (RequestError, ReadTimeout) as exc: msg = "Network error getting docket: %s" if self.request.retries == self.max_retries: logger.error(f"{msg} Aborting chain.", logging_id) # noqa: G004 @@ -1898,8 +1889,8 @@ def get_appellate_docket_by_docket_number( logger.info("Querying docket report %s", logging_id) try: - report.query(docket_number, **kwargs) - except requests.RequestException as e: + async_to_sync(report.query)(docket_number, **kwargs) + except RequestError as e: logger.warning("Problem getting docket %s", logging_id) if self.request.retries == self.max_retries: self.request.chain = None @@ -1947,7 +1938,7 @@ def get_appellate_docket_by_docket_number( } -def get_att_report_by_rd( +async def get_att_report_by_rd( rd: RECAPDocument, session_data: SessionData, ) -> AttachmentPage | None: @@ -1964,9 +1955,11 @@ def get_att_report_by_rd( s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) - pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) - is_appellate_case = is_appellate_court(pacer_court_id) - is_acms_document = rd.is_acms_document() + de = await DocketEntry.objects.aget(id=rd.docket_entry_id) + d = await Docket.objects.aget(id=de.docket_id) + pacer_court_id = map_cl_to_pacer_id(d.court_id) + is_appellate_case = await is_appellate_court(pacer_court_id) + is_acms_document = await 
sync_to_async(rd.is_acms_document)() if is_acms_document: report_class = ACMSAttachmentPage @@ -1978,11 +1971,11 @@ def get_att_report_by_rd( att_report = report_class(pacer_court_id, s) if is_acms_document: - docket_case_id = rd.docket_entry.docket.pacer_case_id + docket_case_id = d.pacer_case_id rd_entry_id = rd.pacer_doc_id - att_report.query(docket_case_id, rd_entry_id) + await att_report.query(docket_case_id, rd_entry_id) else: - att_report.query(rd.pacer_doc_id) + await att_report.query(rd.pacer_doc_id) return att_report @@ -2013,7 +2006,7 @@ def get_attachment_page_by_rd( self.request.chain = None return None try: - att_report = get_att_report_by_rd(rd, session_data) + att_report = async_to_sync(get_att_report_by_rd)(rd, session_data) except HTTPError as exc: if exc.response and exc.response.status_code in [ HTTPStatus.INTERNAL_SERVER_ERROR, @@ -2033,7 +2026,7 @@ def get_attachment_page_by_rd( logger.error(msg, str(exc)) self.request.chain = None return None - except requests.RequestException as exc: + except RequestError as exc: logger.warning("Unable to get attachment page for %s", rd) raise self.retry(exc=exc) return att_report @@ -2081,8 +2074,8 @@ def get_bankr_claims_registry( logger.info("Querying claims information for %s", logging_id) report = ClaimsRegister(map_cl_to_pacer_id(d.court_id), s) try: - report.query(d.pacer_case_id, d.docket_number_raw) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(d.pacer_case_id, d.docket_number) + except (RequestError, ReadTimeout) as exc: if self.request.retries == self.max_retries: self.request.chain = None logger.error( @@ -2205,7 +2198,7 @@ def save_attachment_pq_from_text( return pq.pk -def download_acms_pdf_by_rd( +async def download_acms_pdf_by_rd( court_id: str, acms_entry_id: str, acms_doc_id: str, @@ -2227,11 +2220,11 @@ def download_acms_pdf_by_rd( cookies=session_data.cookies, proxy=session_data.proxy_address ) report = ACMSDocketReport(pacer_court_id, s) - r, r_msg = 
report.download_pdf(acms_entry_id, acms_doc_id) + r, r_msg = await report.download_pdf(acms_entry_id, acms_doc_id) return r, r_msg -def download_pacer_pdf_by_rd( +async def download_pacer_pdf_by_rd( rd_pk: int, pacer_case_id: str, pacer_doc_id: str, @@ -2249,34 +2242,36 @@ def download_pacer_pdf_by_rd( and proxy. :param magic_number: The magic number to fetch PACER documents for free this is an optional field, only used by RECAP Email documents - :return: A two-tuple of requests.Response object usually containing a PDF, + :return: A two-tuple of httpx.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ - rd = RECAPDocument.objects.get(pk=rd_pk) - pacer_court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) + rd = await RECAPDocument.objects.aget(pk=rd_pk) + de = await DocketEntry.objects.aget(id=rd.docket_entry_id) + d = await Docket.objects.aget(id=de.docket_id) + pacer_court_id = map_cl_to_pacer_id(d.court_id) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) - if is_appellate_court(pacer_court_id): + if await is_appellate_court(pacer_court_id): report = AppellateDocketReport(pacer_court_id, s) pacer_doc_id = ( pacer_doc_id if not rd.attachment_number else f"{pacer_doc_id[:3]}1{pacer_doc_id[4:]}" ) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_doc_id=pacer_doc_id, pacer_case_id=pacer_case_id ) else: report = FreeOpinionReport(pacer_court_id, s) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_case_id, pacer_doc_id, magic_number, de_seq_num=de_seq_num ) return r, r_msg -def download_pdf_by_magic_number( +async def download_pdf_by_magic_number( court_id: str, pacer_doc_id: str, pacer_case_id: str, @@ -2298,7 +2293,7 @@ def download_pdf_by_magic_number( :param de_seq_num: The sequential number assigned by the PACER system to identify the docket entry within a case. 
:param acms: Whether the download belongs to an ACMS notification. - :return: A two-tuple of requests.Response object usually containing a PDF, + :return: A two-tuple of httpx.Response object usually containing a PDF, or None if that wasn't possible, and a string representing the error if there was one. """ @@ -2306,13 +2301,13 @@ def download_pdf_by_magic_number( cookies=session_data.cookies, proxy=session_data.proxy_address ) report = FreeOpinionReport(court_id, s) - r, r_msg = report.download_pdf( + r, r_msg = await report.download_pdf( pacer_case_id, pacer_doc_id, magic_number, appellate, de_seq_num, acms ) return r, r_msg -def get_document_number_from_confirmation_page( +async def get_document_number_from_confirmation_page( court_id: str, pacer_doc_id: str ) -> str: """Get the PACER document number from the PACER download confirmation page. @@ -2322,20 +2317,20 @@ def get_document_number_from_confirmation_page( :return: The PACER document number is available or an empty string if not. """ - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) doc_num_report = DownloadConfirmationPage(court_id, s) - doc_num_report.query(pacer_doc_id) + await doc_num_report.query(pacer_doc_id) data = doc_num_report.data return data.get("document_number", "") -def get_document_number_for_appellate( +async def get_document_number_for_appellate( court_id: str, pacer_doc_id: str, pq: ProcessingQueue, @@ -2360,7 +2355,7 @@ def get_document_number_for_appellate( pdf_bytes = local_path.read() if pdf_bytes: # For other jurisdictions try first to get it from the PDF document. 
- dn_response = async_to_sync(microservice)( + dn_response = await microservice( service="document-number", file_type="pdf", file=pdf_bytes, @@ -2371,7 +2366,7 @@ def get_document_number_for_appellate( if not document_number and pacer_doc_id and not acms: # If we still don't have the document number fall back on the # download confirmation page - document_number = get_document_number_from_confirmation_page( + document_number = await get_document_number_from_confirmation_page( court_id, pacer_doc_id ) @@ -2392,7 +2387,7 @@ def get_document_number_for_appellate( return document_number -def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: +async def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: """Check if a pacer doc is sealed, querying the document in PACER. If a receipt is returned the document is not sealed, otherwise is sealed. @@ -2401,22 +2396,22 @@ def is_pacer_doc_sealed(court_id: str, pacer_doc_id: str) -> bool: :return: True if the document is sealed on PACER, False otherwise. 
""" - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) receipt_report = DownloadConfirmationPage(court_id, s) - receipt_report.query(pacer_doc_id) + await receipt_report.query(pacer_doc_id) data = receipt_report.data if data == {}: return True return False -def is_docket_entry_sealed( +async def is_docket_entry_sealed( court_id: str, case_id: str, doc_id: str | None ) -> bool: """Check if a docket entry is sealed, querying the download confirmation @@ -2433,8 +2428,8 @@ def is_docket_entry_sealed( if not doc_id: return False - recap_email_user = User.objects.get(username="recap-email") - session_data = get_or_cache_pacer_cookies( + recap_email_user = await User.objects.aget(username="recap-email") + session_data = await get_or_cache_pacer_cookies( recap_email_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) @@ -2445,8 +2440,7 @@ def is_docket_entry_sealed( return report.is_entry_sealed(case_id, doc_id) -def update_rd_metadata( - self: Task, +async def update_rd_metadata( rd_pk: int, pdf_bytes: bytes | None, r_msg: str, @@ -2459,7 +2453,6 @@ def update_rd_metadata( ) -> tuple[bool, str]: """After querying PACER and downloading a document, save it to the DB. - :param self: The celery task :param rd_pk: The primary key of the RECAPDocument to work on :param pdf_bytes: The byte array of the PDF. :param r_msg: A message from the download function about an error that was @@ -2475,8 +2468,8 @@ def update_rd_metadata( error/success message string. 
""" - rd = RECAPDocument.objects.get(pk=rd_pk) - if pdf_bytes is None: + rd = await RECAPDocument.objects.aget(pk=rd_pk) + if not pdf_bytes: if r_msg and "An attachment page was returned instead" in r_msg: msg = ( "This PACER document is part of an attachment page. " @@ -2490,14 +2483,13 @@ def update_rd_metadata( f"Unable to get PDF for RECAP Document '{rd_pk}' " f"at '{court_id}' with doc id '{pacer_doc_id}'" ) - self.request.chain = None return False, msg file_name = get_document_filename( court_id, pacer_case_id, document_number, attachment_number ) cf = ContentFile(pdf_bytes) - rd.filepath_local.save(file_name, cf, save=False) + await sync_to_async(rd.filepath_local.save)(file_name, cf, save=False) rd.file_size = rd.filepath_local.size rd.is_available = True # We've got the PDF. rd.date_upload = rd.date_upload or now() @@ -2506,7 +2498,7 @@ def update_rd_metadata( # request.content is sometimes a str, sometimes unicode, so # force it all to be bytes, pleasing hashlib. rd.sha1 = sha1(pdf_bytes) - response = async_to_sync(doc_page_count_service)(rd) + response = await doc_page_count_service(rd) if response.is_success: rd.page_count = int(response.text) assert isinstance(rd.page_count, (int | type(None))), ( @@ -2514,12 +2506,10 @@ def update_rd_metadata( ) # Save and extract, skipping OCR. 
- rd.save() + await rd.asave() # Make sure we mark the docket as needing upload - async_to_sync(mark_ia_upload_needed)( - rd.docket_entry.docket, save_docket=True - ) + await mark_ia_upload_needed(rd.docket_entry.docket, save_docket=True) return True, "Saved item successfully" @@ -2541,7 +2531,7 @@ def add_tags(rd: RECAPDocument, tag_name: str | None) -> None: @app.task( bind=True, - autoretry_for=(PacerLoginException, RequestException, HTTPError), + autoretry_for=(PacerLoginException, RequestError, HTTPError), max_retries=3, interval_start=5, interval_step=5, @@ -2572,7 +2562,7 @@ def get_pacer_doc_by_rd( pacer_case_id = rd.docket_entry.docket.pacer_case_id de_seq_num = rd.docket_entry.pacer_sequence_number - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, rd.pacer_doc_id, @@ -2584,8 +2574,7 @@ def get_pacer_doc_by_rd( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2605,7 +2594,7 @@ def get_pacer_doc_by_rd( @app.task( bind=True, - autoretry_for=(ConnectionError, ReadTimeout, HTTPError, RequestException), + autoretry_for=(ConnectError, ReadTimeout, HTTPError, RequestError), max_retries=15, interval_start=5, interval_step=5, @@ -2687,7 +2676,7 @@ def get_pacer_doc_by_rd_and_description( pacer_case_id = rd.docket_entry.docket.pacer_case_id de_seq_num = rd.docket_entry.pacer_sequence_number - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, att_found["pacer_doc_id"], @@ -2699,8 +2688,7 @@ def get_pacer_doc_by_rd_and_description( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2712,6 +2700,7 @@ def get_pacer_doc_by_rd_and_description( ) if success is False: + self.request.chain = None return # Skip 
OCR for now. It'll happen in a second step. @@ -2748,12 +2737,12 @@ def get_pacer_doc_id_with_show_case_doc_url( last_try = self.request.retries == self.max_retries try: if rd.document_type == rd.ATTACHMENT: - report.query( + async_to_sync(report.query)( d.pacer_case_id, rd.document_number, rd.attachment_number ) else: - report.query(d.pacer_case_id, rd.document_number) - except (RequestException, ReadTimeoutError) as exc: + async_to_sync(report.query)(d.pacer_case_id, rd.document_number) + except (RequestError, ReadTimeout) as exc: msg = "Unable to get PDF for %s" if last_try: logger.error(msg, rd) @@ -2874,7 +2863,7 @@ def query_and_save_list_of_creditors( if not os.path.exists(html_file): try: report_hidden_api = PossibleCaseNumberApi(court_id, s) - report_hidden_api.query(docket_number) + async_to_sync(report_hidden_api.query)(docket_number) result = report_hidden_api.data( office_number=row["OFFICE"], docket_number_letters="bk", @@ -2928,7 +2917,7 @@ def query_and_save_list_of_creditors( # First get the POST param to ensure the same cost as in the browser. try: - post_param = report.query_post_param() + post_param = async_to_sync(report.query_post_param)() except IndexError as exc: # Sometimes this query fails, retry if there are retries available. 
if self.request.retries == self.max_retries: @@ -2956,7 +2945,7 @@ def query_and_save_list_of_creditors( logger.info("Invalid POST param for %s, aborting...", court_id) return None - report.query( + async_to_sync(report.query)( pacer_case_id=pacer_case_id, docket_number=docket_number, post_param=post_param, @@ -3275,7 +3264,7 @@ def download_pdf_in_stream( """ @retry( - (ConnectionError, Timeout), + (ConnectionError, requests.Timeout), tries=3, delay=0.25, backoff=1, diff --git a/cl/corpus_importer/utils.py b/cl/corpus_importer/utils.py index 8543a599f3..1db2828fe7 100644 --- a/cl/corpus_importer/utils.py +++ b/cl/corpus_importer/utils.py @@ -145,19 +145,7 @@ def is_bankruptcy_court(court_id: str) -> bool: return bankr_court_ids.filter(pk=court_id).exists() -def is_appellate_court(court_id: str) -> bool: - """Checks if the given court_id belongs to an appellate court. - - :param court_id: The unique identifier of the court. - - :return: True if the court_id corresponds to an appellate court, - False otherwise. - """ - appellate_court_ids = Court.federal_courts.appellate_courts() - return appellate_court_ids.filter(pk=court_id).exists() - - -async def ais_appellate_court(court_id: str) -> bool: +async def is_appellate_court(court_id: str) -> bool: """Checks if the given court_id belongs to an appellate court. :param court_id: The unique identifier of the court. 
diff --git a/cl/favorites/tasks.py b/cl/favorites/tasks.py index 71aaf8884c..79992ad8fd 100644 --- a/cl/favorites/tasks.py +++ b/cl/favorites/tasks.py @@ -1,3 +1,4 @@ +from asgiref.sync import async_to_sync from django.conf import settings from django.contrib.auth.models import User from django.utils.timezone import now @@ -38,14 +39,14 @@ def check_prayer_pacer(self, rd_pk: int, user_pk: int): court_id = map_cl_to_pacer_id(rd.docket_entry.docket.court_id) pacer_doc_id = rd.pacer_doc_id recap_user = User.objects.get(username="recap") - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( recap_user.pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) s = ProxyPacerSession( cookies=session_data.cookies, proxy=session_data.proxy_address ) receipt_report = DownloadConfirmationPage(court_id, s) - receipt_report.query(pacer_doc_id) + async_to_sync(receipt_report.query)(pacer_doc_id) data = receipt_report.data if data == {} and not is_pdf(receipt_report.response): diff --git a/cl/lasc/tasks.py b/cl/lasc/tasks.py index 22690880fd..b9defb6a34 100644 --- a/cl/lasc/tasks.py +++ b/cl/lasc/tasks.py @@ -3,13 +3,14 @@ import os import pickle +from asgiref.sync import async_to_sync from django.apps import apps from django.conf import settings from django.core.files.base import ContentFile from django.db import transaction +from httpx import RequestError from juriscraper.lasc.fetch import LASCSearch from juriscraper.lasc.http import LASCSession -from requests import RequestException from cl.celery_init import app from cl.lasc.models import ( @@ -110,8 +111,8 @@ def download_pdf(self, pdf_pk): return try: - pdf_data = lasc.get_pdf_from_url(q_pdf.document_url) - except RequestException as exc: + pdf_data = async_to_sync(lasc.get_pdf_from_url)(q_pdf.document_url) + except RequestError as exc: logger.warning( "Got RequestException trying to get PDF for PDF Queue %s", q_pdf.pk, @@ -194,9 +195,11 @@ def add_or_update_case_db(self, 
case_id): clean_data = {} try: - clean_data = lasc.get_json_from_internal_case_id(case_id) + clean_data = async_to_sync(lasc.get_json_from_internal_case_id)( + case_id + ) logger.info("Successful Query") - except RequestException as e: + except RequestError as e: retries_remaining = self.max_retries - self.request.retries if retries_remaining == 0: logger.error("RequestException, unable to get case at %s", case_id) @@ -349,8 +352,8 @@ def fetch_date_range(self, start, end): lasc = make_lasc_search() try: - cases = lasc.query_cases_by_date(start, end) - except RequestException as exc: + cases = async_to_sync(lasc.query_cases_by_date)(start, end) + except RequestError as exc: logger.warning( "Got RequestException trying to get cases by date " "between %s and %s", diff --git a/cl/lib/pacer_session.py b/cl/lib/pacer_session.py index 50f3056fad..3c1c3829a8 100644 --- a/cl/lib/pacer_session.py +++ b/cl/lib/pacer_session.py @@ -1,12 +1,12 @@ import pickle import random from dataclasses import dataclass -from urllib.parse import urlparse +from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings +from httpx import URL, Cookies, Request from juriscraper.pacer import PacerSession from redis import Redis -from requests.cookies import RequestsCookieJar from cl.lib.redis_utils import get_redis_interface @@ -26,7 +26,7 @@ class SessionData: `ProxyPacerSession` class. 
""" - cookies: RequestsCookieJar + cookies: Cookies proxy_address: str = "" def __post_init__(self): @@ -68,10 +68,10 @@ def __init__( } self.headers["X-WhSentry-TLS"] = "true" - def send(self, request, **kwargs): + async def send(self, request: Request, **kwargs): """Send a given PreparedRequest.""" request.url = self._change_protocol(request.url) - return super().send(request, **kwargs) + return await super().send(request, **kwargs) def _pick_proxy_connection(self) -> str: """ @@ -85,7 +85,7 @@ def _pick_proxy_connection(self) -> str: """ return random.choice(settings.EGRESS_PROXY_HOSTS) - def _change_protocol(self, url: str) -> str: + def _change_protocol(self, url: URL | str) -> URL: """Converts a URL from HTTPS to HTTP protocol. By default, HTTP clients create a CONNECT tunnel when a proxy is @@ -100,26 +100,25 @@ def _change_protocol(self, url: str) -> str: https://github.com/juggernaut/webhook-sentry?tab=readme-ov-file#https-target Args: - url (str): The URL to modify. + url (URL): The URL to modify. Returns: - str: The URL with the protocol changed from HTTPS to HTTP. + URL: The URL with the protocol changed from HTTPS to HTTP. 
""" - new_url = urlparse(url) - return new_url._replace(scheme="http").geturl() + return URL(url, scheme="http") - def _prepare_login_request(self, url, *args, **kwargs): - return super(PacerSession, self).post( + async def _prepare_login_request(self, url, *args, **kwargs): + return await super(PacerSession, self).post( self._change_protocol(url), **kwargs ) - def post(self, url, *args, **kwargs): - return super().post(self._change_protocol(url), **kwargs) + async def post(self, url, *args, **kwargs): + return await super().post(self._change_protocol(url), **kwargs) - def get(self, url, *args, **kwargs): - return super().get(self._change_protocol(url), **kwargs) + async def get(self, url, *args, **kwargs): + return await super().get(self._change_protocol(url), **kwargs) - def _get_saml_auth_request_parameters( + async def _get_saml_auth_request_parameters( self, court_id: str ) -> dict[str, str]: """ @@ -129,14 +128,16 @@ def _get_saml_auth_request_parameters( workflow can be reused in subsequent requests through a proxy connection by setting their 'secure' attribute to False. """ - saml_credentials = super()._get_saml_auth_request_parameters(court_id) + saml_credentials = await super()._get_saml_auth_request_parameters( + court_id + ) # Update cookies so they can be sent over non-HTTPS connections for cookie in self.cookies: cookie.secure = False return saml_credentials -def log_into_pacer( +async def log_into_pacer( username: str, password: str, client_code: str | None = None, @@ -154,10 +155,11 @@ def log_into_pacer( password=password, client_code=client_code, ) - s.login() + await s.login() return SessionData(s.cookies, s.proxy_address) +@sync_to_async def get_or_cache_pacer_cookies( user_pk: str | int, username: str, @@ -184,7 +186,7 @@ def get_or_cache_pacer_cookies( :return: A SessionData object containing the session's cookies and proxy. 
""" r = get_redis_interface("CACHE", decode_responses=False) - cookies_data = get_pacer_cookie_from_cache(user_pk, r=r) + cookies_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk, r=r) ttl_seconds = r.ttl(session_key % user_pk) if cookies_data and ttl_seconds >= 300 and not refresh: # cookies were found in cache and ttl >= 5 minutes, return them @@ -192,7 +194,9 @@ def get_or_cache_pacer_cookies( # Unable to find cookies in cache, are about to expire or refresh needed # Login and cache new values. - session_data = log_into_pacer(username, password, client_code) + session_data = async_to_sync(log_into_pacer)( + username, password, client_code + ) cookie_expiration = 60 * 60 r.set( session_key % user_pk, @@ -202,6 +206,7 @@ def get_or_cache_pacer_cookies( return session_data +@sync_to_async def get_pacer_cookie_from_cache( user_pk: str | int, r: Redis | None = None, @@ -220,6 +225,7 @@ def get_pacer_cookie_from_cache( return pickle.loads(pickled_cookie) +@sync_to_async def delete_pacer_cookie_from_cache( user_pk: str | int, r: Redis | None = None, diff --git a/cl/lib/tests.py b/cl/lib/tests.py index 553c7aee35..91b71424ea 100644 --- a/cl/lib/tests.py +++ b/cl/lib/tests.py @@ -242,7 +242,7 @@ def test_compute_new_cookies_with_new_format(self, mock_log_into_pacer): self.test_cookies, "http://proxy_1:9090", ) - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_user_new_cookie", username="test", password="password" ) self.assertEqual(mock_log_into_pacer.call_count, 1) @@ -252,7 +252,7 @@ def test_compute_new_cookies_with_new_format(self, mock_log_into_pacer): @patch("cl.lib.pacer_session.log_into_pacer") def test_parse_cookie_proxy_pair_properly(self, mock_log_into_pacer): """Can we parse the dataclass from cache properly?""" - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_user_new_format", username="test", password="password" ) 
self.assertEqual(mock_log_into_pacer.call_count, 0) @@ -270,7 +270,7 @@ def test_compute_cookies_for_almost_expired_data( # Attempts to get almost expired cookies with the new format from cache # Expects refresh. - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "test_new_format_almost_expired", username="test", password="password", diff --git a/cl/recap/api_serializers.py b/cl/recap/api_serializers.py index 88a7c4256e..6710fbb8d3 100644 --- a/cl/recap/api_serializers.py +++ b/cl/recap/api_serializers.py @@ -1,3 +1,4 @@ +from asgiref.sync import async_to_sync from juriscraper.lib.exceptions import PacerLoginException from rest_framework import serializers from rest_framework.exceptions import ValidationError @@ -128,7 +129,7 @@ def validate(self, attrs): UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE, ]: # Appellate court dockets. Is the court valid? - if not is_appellate_court(attrs["court"].pk): + if not async_to_sync(is_appellate_court)(attrs["court"].pk): raise ValidationError( "{} is not an appellate court ID. Did you mean to use the " "upload_type for district dockets?".format(attrs["court"]) @@ -319,7 +320,7 @@ def validate(self, attrs): if ( attrs.get("pacer_case_id") and not attrs.get("docket_number") - and is_appellate_court(attrs.get("court").pk) + and async_to_sync(is_appellate_court)(attrs.get("court").pk) ): # The user is trying to purchase an appellate docket using only the # PACER case ID, which is not supported. @@ -331,7 +332,7 @@ def validate(self, attrs): court_id = get_court_id_from_fetch_queue(attrs) if ( attrs.get("de_number_end") or attrs.get("de_number_start") - ) and is_appellate_court(court_id): + ) and async_to_sync(is_appellate_court)(court_id): raise ValidationError( "Docket entry filtering by number is not supported for " "appellate courts. Use date range filtering with " @@ -371,7 +372,7 @@ def validate(self, attrs): # Do the PACER credentials work? 
try: - _ = get_or_cache_pacer_cookies( + _ = async_to_sync(get_or_cache_pacer_cookies)( attrs["user"].pk, username=attrs.pop("pacer_username"), password=attrs.pop("pacer_password"), diff --git a/cl/recap/management/commands/merge_idb_into_dockets.py b/cl/recap/management/commands/merge_idb_into_dockets.py index ba90c71071..d8fc587108 100644 --- a/cl/recap/management/commands/merge_idb_into_dockets.py +++ b/cl/recap/management/commands/merge_idb_into_dockets.py @@ -1,5 +1,6 @@ import os +from asgiref.sync import async_to_sync from celery.canvas import chain from django.conf import settings from juriscraper.lib.string_utils import CaseNameTweaker @@ -119,7 +120,7 @@ def update_any_missing_pacer_case_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() for i, d in enumerate(ds.iterator()): if i < options["offset"]: continue @@ -132,7 +133,7 @@ def update_any_missing_pacer_case_ids(options): session = ProxyPacerSession( username=PACER_USERNAME, password=PACER_PASSWORD ) - session.login() + async_to_sync(session.login)() throttle.maybe_wait() logger.info("Getting pacer_case_id for item %s", d) diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 4d9f1ae4aa..fd8f84c05f 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -20,7 +20,7 @@ set_skip_percolation_if_parties_data, ) from cl.corpus_importer.utils import ( - ais_appellate_court, + is_appellate_court, is_long_appellate_document_number, mark_ia_upload_needed, ) @@ -1001,7 +1001,8 @@ async def add_docket_entries( # entry, we avoid creating the main RD a second+ time when we get the # docket sheet a second+ time. 
- appellate_court_id_exists = await ais_appellate_court(d.court_id) + appellate_court_id_exists = await is_appellate_court(d.court_id) + appellate_rd_att_exists = False if de_created is False and appellate_court_id_exists: # In existing appellate entry merges, check if the entry has at # least one attachment. @@ -1962,7 +1963,7 @@ async def merge_attachment_page_data( ContentFile(text.encode()), ) - court_is_appellate = await ais_appellate_court(court.pk) + court_is_appellate = await is_appellate_court(court.pk) main_rd_to_att = False for attachment in attachment_dicts: sanity_checks = [ diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 3d281aa91f..2b607e7392 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -12,7 +12,6 @@ from zipfile import ZipFile import httpx -import requests from asgiref.sync import async_to_sync, sync_to_async from botocore import exceptions as botocore_exception from celery import Task @@ -42,8 +41,6 @@ from juriscraper.pacer.email import DocketType from lxml.etree import ParserError from redis import ConnectionError as RedisConnectionError -from requests import HTTPError -from requests.packages.urllib3.exceptions import ReadTimeoutError from cl.alerts.tasks import enqueue_docket_alert, send_alert_and_webhook from cl.alerts.utils import ( @@ -64,7 +61,6 @@ update_rd_metadata, ) from cl.corpus_importer.utils import ( - ais_appellate_court, is_appellate_court, is_bankruptcy_court, is_long_appellate_document_number, @@ -188,7 +184,7 @@ def do_pacer_fetch(fq: PacerFetchQueue): court_id = get_court_id_from_fetch_queue(fq) c = ( chain(fetch_appellate_docket.si(fq.pk)) - if is_appellate_court(court_id) + if async_to_sync(is_appellate_court)(court_id) else chain(fetch_docket.si(fq.pk)) ) c = c | mark_fq_successful.si(fq.pk) @@ -421,7 +417,7 @@ async def process_recap_pdf(pk, subdocket_replication: bool = False): # from PQ if this task is part of a subdocket replication. 
In subdockets, # this metadata may differ even when the document is the same. if ( - not await ais_appellate_court(court_id) + not await is_appellate_court(court_id) or not is_long_appellate_document_number(rd.document_number) ) and not subdocket_replication: rd.document_number = str(pq.document_number) @@ -820,7 +816,7 @@ async def find_subdocket_pdf_rds( ) subdocket_replication = False - if await ais_appellate_court(pq.court_id): + if await is_appellate_court(pq.court_id): # Abort the process for appellate documents. Subdockets cannot be found # in appellate cases. return [(pq.pk, subdocket_replication)] @@ -1978,15 +1974,15 @@ def fetch_pacer_doc_by_rd_base( if not is_pacer_court_accessible(rd.docket_entry.docket.court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {rd.docket_entry.docket.court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + async_to_sync(mark_fq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) if rd.is_available: msg = "PDF already marked as 'is_available'. Doing nothing." - mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) self.request.chain = None return @@ -1998,14 +1994,16 @@ def fetch_pacer_doc_by_rd_base( "document associated with it, or it may need to be updated via " "the docket report to acquire a pacer_doc_id. Aborting request." ) - mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.INVALID_CONTENT + ) self.request.chain = None return - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return @@ -2014,14 +2012,14 @@ def fetch_pacer_doc_by_rd_base( court_id = rd.docket_entry.docket.court_id try: if rd.is_acms_document(): - r, r_msg = download_acms_pdf_by_rd( + r, r_msg = async_to_sync(download_acms_pdf_by_rd)( court_id=court_id, acms_entry_id=rd.pacer_doc_id, acms_doc_id=rd.acms_document_guid, session_data=session_data, ) else: - r, r_msg = download_pacer_pdf_by_rd( + r, r_msg = async_to_sync(download_pacer_pdf_by_rd)( rd.pk, pacer_case_id, pacer_doc_id, @@ -2029,19 +2027,19 @@ def fetch_pacer_doc_by_rd_base( magic_number, de_seq_num=de_seq_num, ) - except (requests.RequestException, HTTPError): + except (httpx.RequestError, httpx.HTTPError): msg = "Failed to get PDF from network." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return except PacerLoginException as exc: msg = f"PacerLoginException while getting document for rd: {rd.pk}." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - delete_pacer_cookie_from_cache(fq.user_id) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(delete_pacer_cookie_from_cache)(fq.user_id) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2049,8 +2047,7 @@ def fetch_pacer_doc_by_rd_base( pdf_bytes = None if r: pdf_bytes = r.content - success, msg = update_rd_metadata( - self, + success, msg = async_to_sync(update_rd_metadata)( rd_pk, pdf_bytes, r_msg, @@ -2063,13 +2060,13 @@ def fetch_pacer_doc_by_rd_base( ) if success is False: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return # Logic to replicate the PDF sub-dockets matched by RECAPDocument subdocket_pqs_to_replicate = [] - if not is_appellate_court(court_id): + if not async_to_sync(is_appellate_court)(court_id): subdocket_pqs_to_replicate = find_subdocket_pdf_rds_from_data( fq.user_id, court_id, pacer_doc_id, [pacer_case_id], pdf_bytes ) @@ -2144,7 +2141,7 @@ def fetch_pacer_doc_by_rd_and_mark_fq_completed( # case, fetch_pacer_doc_by_rd_base will return None. fq = PacerFetchQueue.objects.get(pk=fq_pk) msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) return None @@ -2180,15 +2177,15 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] raise self.retry() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + async_to_sync(mark_fq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) if not pacer_doc_id: msg = f"Unable to get attachment page: Unknown pacer_doc_id for RECAP Document object {rd.pk}" - mark_fq_status(fq, msg, PROCESSING_STATUS.NEEDS_INFO) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.NEEDS_INFO) self.request.chain = None return [] @@ -2199,15 +2196,15 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: self.request.chain = None return [] - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if not session_data: msg = "Unable to find cached cookies. Aborting request." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] try: - r = get_att_report_by_rd(rd, session_data) + r = async_to_sync(get_att_report_by_rd)(rd, session_data) except ParserError as exc: if self.request.retries == self.max_retries: msg = "ParserError while getting attachment page" @@ -2215,14 +2212,24 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: self.request.chain = None return [] raise self.retry(exc=exc) - except HTTPError as exc: + except httpx.RequestError as exc: + if self.request.retries == self.max_retries: + msg = "Failed to get attachment page from network." + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + self.request.chain = None + return [] + logger.info("Ran into a RequestException. Retrying.") + raise self.retry(exc=exc) + except httpx.HTTPError as exc: msg = "Failed to get attachment page from network." if exc.response.status_code in [ HTTPStatus.INTERNAL_SERVER_ERROR, HTTPStatus.GATEWAY_TIMEOUT, ]: if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.FAILED + ) self.request.chain = None return [] logger.info( @@ -2230,30 +2237,22 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: ) raise self.retry(exc=exc) else: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - self.request.chain = None - return [] - except requests.RequestException as exc: - if self.request.retries == self.max_retries: - msg = "Failed to get attachment page from network." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] - logger.info("Ran into a RequestException. 
Retrying.") - raise self.retry(exc=exc) except PacerLoginException as exc: msg = "PacerLoginException while getting attachment page" if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) - delete_pacer_cookie_from_cache(fq.user_id) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(delete_pacer_cookie_from_cache)(fq.user_id) self.request.chain = None return [] - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) - is_appellate = is_appellate_court(court_id) + is_appellate = async_to_sync(is_appellate_court)(court_id) if not is_acms_case: text = r.response.text # Determine the appropriate parser function based on court jurisdiction @@ -2270,7 +2269,9 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: if att_data == {}: msg = "Not a valid attachment page upload" - mark_fq_status(fq, msg, PROCESSING_STATUS.INVALID_CONTENT) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.INVALID_CONTENT + ) self.request.chain = None return [] @@ -2297,22 +2298,24 @@ def fetch_attachment_page(self: Task, fq_pk: int) -> list[tuple[int, bool]]: "Too many documents found when attempting to associate " "attachment data" ) - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] except RECAPDocument.DoesNotExist as exc: msg = "Could not find docket to associate with attachment metadata" if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return [] - mark_fq_status(fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY) + async_to_sync(mark_fq_status)( + fq, msg, PROCESSING_STATUS.QUEUED_FOR_RETRY + ) raise self.retry(exc=exc) msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) # Logic to replicate the attachment page to sub-dockets matched by RECAPDocument - if is_appellate_court(court_id): + if async_to_sync(is_appellate_court)(court_id): # Subdocket replication for appellate courts is currently not supported. self.request.chain = None return [] @@ -2410,7 +2413,7 @@ def get_fq_appellate_docket_kwargs(fq: PacerFetchQueue): } -def fetch_pacer_case_id_and_title(s, fq, court_id): +async def fetch_pacer_case_id_and_title(s, fq, court_id): """Use PACER's hidden API to learn the pacer_case_id of a case :param s: A PacerSession object to use @@ -2427,7 +2430,7 @@ def fetch_pacer_case_id_and_title(s, fq, court_id): ) report = PossibleCaseNumberApi(map_cl_to_pacer_id(court_id), s) - report.query(docket_number) + await report.query(docket_number) return report.data() return {} @@ -2483,7 +2486,7 @@ def fetch_docket_by_pacer_case_id( :return: a dict with information about the docket and the new data """ report = DocketReport(map_cl_to_pacer_id(court_id), session) - report.query(pacer_case_id, **get_fq_docket_kwargs(fq)) + async_to_sync(report.query)(pacer_case_id, **get_fq_docket_kwargs(fq)) docket_data = report.data if not docket_data: @@ -2512,7 +2515,7 @@ def purchase_appellate_docket_by_docket_number( if should_check_acms_court(court_id): acms_search = AcmsCaseSearch(court_id=court_id, pacer_session=session) - acms_search.query(docket_number) + async_to_sync(acms_search.query)(docket_number) acms_case_id = ( acms_search.data["pcx_caseid"] if acms_search.data else None ) @@ -2524,9 +2527,9 @@ def purchase_appellate_docket_by_docket_number( if acms_case_id: # ACMSDocketReport only accepts the case ID; filters are not currently # supported for ACMS docket reports. 
- report.query(acms_case_id) + async_to_sync(report.query)(acms_case_id) else: - report.query(docket_number, **kwargs) + async_to_sync(report.query)(docket_number, **kwargs) docket_data = report.data if not docket_data: @@ -2564,17 +2567,17 @@ def fetch_appellate_docket(self, fq_pk): if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() async_to_sync(mark_pq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if session_data is None: msg = f"Cookie cache expired before task could run for user: {fq.user_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2594,13 +2597,13 @@ def fetch_appellate_docket(self, fq_pk): fq=fq, **get_fq_appellate_docket_kwargs(fq), ) - except (requests.RequestException, ReadTimeoutError) as exc: + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = f"Network error while purchasing docket for fq: {fq_pk}." if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg}Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2609,16 +2612,16 @@ def fetch_appellate_docket(self, fq_pk): f"PacerLoginException while getting pacer_case_id for fq: {fq_pk}." 
) if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = f"Unable to purchase docket for fq: {fq_pk}." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2662,17 +2665,17 @@ def fetch_docket(self, fq_pk): if not is_pacer_court_accessible(court_id): if self.request.retries == self.max_retries: msg = f"Blocked by court: {court_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None raise self.retry() async_to_sync(mark_pq_status)(fq, "", PROCESSING_STATUS.IN_PROGRESS) - session_data = get_pacer_cookie_from_cache(fq.user_id) + session_data = async_to_sync(get_pacer_cookie_from_cache)(fq.user_id) if session_data is None: msg = f"Cookie cache expired before task could run for user: {fq.user_id}" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2680,14 +2683,14 @@ def fetch_docket(self, fq_pk): cookies=session_data.cookies, proxy=session_data.proxy_address ) try: - result = fetch_pacer_case_id_and_title(s, fq, court_id) - except (requests.RequestException, ReadTimeoutError) as exc: + result = async_to_sync(fetch_pacer_case_id_and_title)(s, fq, court_id) + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = f"Network error getting pacer_case_id for fq: {fq_pk}." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) @@ -2696,16 +2699,16 @@ def fetch_docket(self, fq_pk): f"PacerLoginException while getting pacer_case_id for fq: {fq_pk}." ) if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg} Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = "Unable to parse pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2716,7 +2719,7 @@ def fetch_docket(self, fq_pk): if result is None: msg = "Cannot find case by docket number (perhaps it's sealed?)" - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2728,26 +2731,26 @@ def fetch_docket(self, fq_pk): if not pacer_case_id: msg = "Unable to determine pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None start_time = now() try: result = fetch_docket_by_pacer_case_id(s, court_id, pacer_case_id, fq) - except (requests.RequestException, ReadTimeoutError) as exc: + except (httpx.RequestError, httpx.ReadTimeout) as exc: msg = "Network error getting pacer_case_id for fq: %s." 
if self.request.retries == self.max_retries: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None - mark_fq_status( + async_to_sync(mark_fq_status)( fq, f"{msg}Retrying.", PROCESSING_STATUS.QUEUED_FOR_RETRY ) raise self.retry(exc=exc) except ParsingException: msg = "Unable to parse pacer_case_id for docket." - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.FAILED) self.request.chain = None return None @@ -2770,10 +2773,10 @@ def fetch_docket(self, fq_pk): def mark_fq_successful(fq_pk): fq = PacerFetchQueue.objects.get(pk=fq_pk) msg = "Successfully completed fetch and save." - mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + async_to_sync(mark_fq_status)(fq, msg, PROCESSING_STATUS.SUCCESSFUL) -def mark_fq_status(fq, msg, status): +async def mark_fq_status(fq, msg, status): """Update the PacerFetchQueue item with the status and message provided :param fq: The PacerFetchQueue item to update @@ -2786,8 +2789,8 @@ def mark_fq_status(fq, msg, status): fq.status = status if status == PROCESSING_STATUS.SUCCESSFUL: fq.date_completed = now() - fq.save() - send_recap_fetch_webhooks(fq) + await fq.asave() + await send_recap_fetch_webhooks(fq) def get_recap_email_recipients( @@ -2809,7 +2812,9 @@ def get_recap_email_recipients( return recap_email_recipients -def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: +async def get_attachment_page_by_url( + att_page_url: str, court_id: str +) -> str | None: """Get the attachment page report for recap.email documents without being logged into PACER. 
@@ -2823,7 +2828,8 @@ def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: att_page_url, ) req_timeout = (60, 300) - att_response = requests.get(att_page_url, timeout=req_timeout) + async with httpx.AsyncClient() as client: + att_response = await client.get(att_page_url, timeout=req_timeout) att_data = get_data_from_att_report(att_response.text, court_id) if att_data == {}: msg = "Not a valid attachment page upload for recap.email" @@ -2832,7 +2838,7 @@ def get_attachment_page_by_url(att_page_url: str, court_id: str) -> str | None: return att_response.text -def set_rd_sealed_status( +async def set_rd_sealed_status( rd: RECAPDocument, magic_number: str | None, potentially_sealed: bool ) -> None: """Set RD is_sealed status according to the following conditions: @@ -2848,25 +2854,26 @@ def set_rd_sealed_status( :return: None """ - rd.refresh_from_db() + await rd.arefresh_from_db() if not rd.pacer_doc_id: return if not potentially_sealed: rd.is_sealed = False - rd.save() + await rd.asave() return rd.is_sealed = True - if not magic_number and not is_pacer_doc_sealed( - rd.docket_entry.docket.court.pk, rd.pacer_doc_id + docket_entry = await DocketEntry.objects.aget(id=rd.docket_entry_id) + docket = await Docket.objects.aget(id=docket_entry.docket_id) + if not magic_number and not await is_pacer_doc_sealed( + docket.court_id, rd.pacer_doc_id ): rd.is_sealed = False - rd.save() + await rd.asave() -def save_pacer_doc_from_pq( - self: Task, +async def save_pacer_doc_from_pq( rd: RECAPDocument, fq: PacerFetchQueue, pq: ProcessingQueue, @@ -2885,23 +2892,22 @@ def save_pacer_doc_from_pq( if rd.is_available: msg = "PDF already marked as 'is_available'. Doing nothing." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + await mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) return if pq.status == PROCESSING_STATUS.FAILED or not pq.filepath_local: - set_rd_sealed_status(rd, magic_number, potentially_sealed=True) - mark_fq_status(fq, pq.error_message, PROCESSING_STATUS.FAILED) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=True) + await mark_fq_status(fq, pq.error_message, PROCESSING_STATUS.FAILED) return with pq.filepath_local.open(mode="rb") as local_path: pdf_bytes = local_path.read() - mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) + await mark_fq_status(fq, "", PROCESSING_STATUS.IN_PROGRESS) pacer_case_id = rd.docket_entry.docket.pacer_case_id court_id = rd.docket_entry.docket.court_id - success, msg = update_rd_metadata( - self, + success, msg = await update_rd_metadata( rd.pk, pdf_bytes, pq.error_message, @@ -2913,12 +2919,12 @@ def save_pacer_doc_from_pq( ) if success is False: - mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) + await mark_fq_status(fq, msg, PROCESSING_STATUS.FAILED) return msg = "Successfully completed fetch and save." 
- mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) - set_rd_sealed_status(rd, magic_number, potentially_sealed=False) + await mark_fq_status(fq, msg, PROCESSING_STATUS.SUCCESSFUL) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=False) return rd.pk @@ -3001,7 +3007,7 @@ def download_pacer_pdf_and_save_to_pq( date_created__gt=cutoff_date, ) if created and magic_number and not is_bankr_short_doc_id: - response, r_msg = download_pdf_by_magic_number( + response, r_msg = async_to_sync(download_pdf_by_magic_number)( court_id, pacer_doc_id, pacer_case_id, @@ -3059,7 +3065,7 @@ def get_and_copy_recap_attachment_docs( :return: None """ - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = async_to_sync(get_pacer_cookie_from_cache)(user_pk) appellate = False unique_pqs = [] for rd_att in att_rds: @@ -3081,7 +3087,7 @@ def get_and_copy_recap_attachment_docs( request_type=REQUEST_TYPE.PDF, recap_document=rd_att, ) - save_pacer_doc_from_pq(self, rd_att, fq, pq, magic_number) + async_to_sync(save_pacer_doc_from_pq)(rd_att, fq, pq, magic_number) if pq not in unique_pqs: unique_pqs.append(pq) @@ -3152,7 +3158,7 @@ def open_and_validate_email_notification( return data, body -def fetch_attachment_data( +async def fetch_attachment_data( document_url: str, court_id: str, dockets_updated: list[DocketUpdatedData], @@ -3169,18 +3175,18 @@ def fetch_attachment_data( :param user_pk: The user to associate with the ProcessingQueue object. :return: The HTML page text. """ - session_data = get_pacer_cookie_from_cache(user_pk) + session_data = await get_pacer_cookie_from_cache(user_pk) # Try to fetch the attachment page without being logged into PACER using # the free look URL. 
- att_report_text = get_attachment_page_by_url(document_url, court_id) + att_report_text = await get_attachment_page_by_url(document_url, court_id) if att_report_text is None: main_rd = ( - dockets_updated[0] + await dockets_updated[0] .des_returned[0] - .recap_documents.earliest("date_created") + .recap_documents.aearliest("date_created") ) # Get the attachment page being logged into PACER - att_report = get_att_report_by_rd(main_rd, session_data) + att_report = await get_att_report_by_rd(main_rd, session_data) att_report_text = att_report.response.text return att_report_text @@ -3327,7 +3333,7 @@ def get_acms_pacer_case_id( cookies=session_data.cookies, proxy=session_data.proxy_address ) acms_search = AcmsCaseSearch(court_id=court_id, pacer_session=s) - acms_search.query(docket_number) + async_to_sync(acms_search.query)(docket_number) return acms_search.data["pcx_caseid"] if acms_search.data else None @@ -3336,9 +3342,9 @@ def get_acms_pacer_case_id( autoretry_for=( botocore_exception.HTTPClientError, botocore_exception.ConnectionError, - requests.ConnectionError, - requests.RequestException, - requests.ReadTimeout, + httpx.ConnectError, + httpx.RequestError, + httpx.ReadTimeout, PacerLoginException, RedisConnectionError, ), @@ -3390,7 +3396,7 @@ def process_recap_email( start_time = now() # Ensures we have PACER cookies ready to go. 
- cookies_data = get_or_cache_pacer_cookies( + cookies_data = async_to_sync(get_or_cache_pacer_cookies)( user_pk, settings.PACER_USERNAME, settings.PACER_PASSWORD ) court_id = epq.court_id @@ -3420,7 +3426,9 @@ def process_recap_email( acms=acms, ) is_potentially_sealed_entry = ( - is_docket_entry_sealed(epq.court_id, pacer_case_id, pacer_doc_id) + async_to_sync(is_docket_entry_sealed)( + epq.court_id, pacer_case_id, pacer_doc_id + ) if pq.status == PROCESSING_STATUS.FAILED and not appellate and not bankr_short_doc_id @@ -3432,7 +3440,7 @@ def process_recap_email( ] if (appellate or acms) and doc_num_from_data is None: # Get the document number for appellate documents. - appellate_doc_num = get_document_number_for_appellate( + appellate_doc_num = async_to_sync(get_document_number_for_appellate)( epq.court_id, pacer_doc_id, pq, acms ) if appellate_doc_num: @@ -3517,7 +3525,7 @@ def process_recap_email( request_type=REQUEST_TYPE.PDF, recap_document=rd, ) - save_pacer_doc_from_pq(self, rd, fq, pq, magic_number) + async_to_sync(save_pacer_doc_from_pq)(rd, fq, pq, magic_number) rd.refresh_from_db() main_rds_available.append(rd.is_available) @@ -3532,7 +3540,7 @@ def process_recap_email( and not is_potentially_sealed_entry and not bankr_short_doc_id ): - att_report_text = fetch_attachment_data( + att_report_text = async_to_sync(fetch_attachment_data)( document_url, epq.court_id, dockets_updated, user_pk ) all_attachment_rds = merge_rd_attachments( @@ -3561,7 +3569,7 @@ def process_recap_email( pacer_doc_id and content_to_replicate and got_content_updated - and not is_appellate_court(court_id) + and not async_to_sync(is_appellate_court)(court_id) ): replicate_recap_email_to_subdockets( user_pk, diff --git a/cl/recap/tests/test_recap_email.py b/cl/recap/tests/test_recap_email.py index 514ccff062..83ea20568a 100644 --- a/cl/recap/tests/test_recap_email.py +++ b/cl/recap/tests/test_recap_email.py @@ -3,6 +3,7 @@ from pathlib import Path from unittest import mock +import 
httpx from asgiref.sync import async_to_sync, sync_to_async from django.conf import settings from django.contrib.auth.hashers import make_password @@ -1323,8 +1324,8 @@ async def test_docket_alert_toggle_confirmation_fails( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @@ -1333,9 +1334,9 @@ async def test_docket_alert_toggle_confirmation_fails( side_effect=lambda *args, **kwargs: MockResponse(200, mock_raw=True), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("nyed_123019137279.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, content=mock_bucket_open("nyed_123019137279.html", "r", True) ), ) async def test_new_recap_email_with_attachments( @@ -1743,15 +1744,15 @@ async def test_new_nda_recap_email_case_no_auto_subscription( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("jpml_85001321035.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, content=mock_bucket_open("jpml_85001321035.html", "r", True) ), ) @mock.patch( @@ -2055,9 +2056,9 @@ async def test_mark_as_sealed_nda_document_not_available_from_magic_link( side_effect=lambda *args, **kwargs: MockResponse(200, mock_raw=True), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse( - 200, mock_bucket_open("nyed_123019137279.html", "r", True) + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response( + 200, 
content=mock_bucket_open("nyed_123019137279.html", "r", True) ), ) async def test_mark_as_sealed_nef_documents_not_available_from_magic_link( @@ -3139,10 +3140,10 @@ def setUp(self) -> None: @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open("nda_document.pdf", "rb", True), + content=mock_bucket_open("nda_document.pdf", "rb", True), ), "OK", ), @@ -3177,10 +3178,10 @@ async def test_nda_get_document_number_from_pdf( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open( + content=mock_bucket_open( "gov.uscourts.ca8.17-2543.00803263743.0.pdf", "rb", True ), ), @@ -3224,10 +3225,10 @@ async def test_nda_get_document_number_from_confirmation_page( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse( + return_value=( + httpx.Response( 200, - mock_bucket_open( + content=mock_bucket_open( "gov.uscourts.ca8.17-2543.00803263743.0.pdf", "rb", True ), ), @@ -3270,8 +3271,8 @@ async def test_nda_get_document_number_fallback( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b""), "OK", ), ) @@ -3309,8 +3310,8 @@ async def test_nda_not_document_number_available( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b""), + return_value=( + httpx.Response(200, content=b""), "OK", ), ) @@ -3359,7 +3360,7 @@ async def test_receive_same_recap_email_nda_notification_different_users( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( + return_value=( None, "Document not available from magic link.", ), @@ 
-3397,13 +3398,13 @@ async def test_nda_document_not_available_get_from_confirmation_page( self.assertEqual(recap_document_first.docket_entry.entry_number, 148) -def mock_method_set_rd_sealed_status( +async def mock_method_set_rd_sealed_status( rd: RECAPDocument, magic_number: str | None, potentially_sealed: bool ) -> None: if rd.document_type == RECAPDocument.PACER_DOCUMENT: - set_rd_sealed_status(rd, magic_number, potentially_sealed=True) + await set_rd_sealed_status(rd, magic_number, potentially_sealed=True) return - return set_rd_sealed_status(rd, magic_number, potentially_sealed) + return await set_rd_sealed_status(rd, magic_number, potentially_sealed) @mock.patch("cl.recap.tasks.enqueue_docket_alert", return_value=True) @@ -3493,14 +3494,14 @@ def setUp(self) -> None: ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_nef_subdocket_replication_no_att( self, @@ -3639,14 +3640,14 @@ async def test_nef_subdocket_replication_no_att( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_multi_nef_subdocket_replication( self, @@ -3848,14 +3849,14 @@ async def test_multi_nef_subdocket_replication( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - 
side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_avoid_triggering_replication_for_minute_entries( self, @@ -4002,8 +4003,8 @@ async def test_avoid_replication_seal_document_and_sealed_attachments( @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @@ -4013,7 +4014,7 @@ async def test_avoid_replication_seal_document_and_sealed_attachments( ) @mock.patch( "cl.recap.tasks.set_rd_sealed_status", - side_effect=mock_method_set_rd_sealed_status, + wraps=mock_method_set_rd_sealed_status, ) async def test_replication_sealed_document_with_no_sealed_attachments( self, @@ -4259,14 +4260,14 @@ async def test_avoid_replication_for_sealed_entry_with_attachments( ) @mock.patch( "cl.recap.tasks.download_pdf_by_magic_number", - side_effect=lambda z, x, c, v, b, d, e, a: ( - MockResponse(200, b"Hello World"), + return_value=( + httpx.Response(200, content=b"Hello World"), "OK", ), ) @mock.patch( - "cl.recap.tasks.requests.get", - side_effect=lambda *args, **kwargs: MockResponse(200, b"Att content."), + "cl.recap.tasks.httpx.AsyncClient.get", + return_value=httpx.Response(200, content=b"Att content."), ) async def test_recap_email_avoid_replication_on_pdf_available( self, diff --git a/cl/recap_rss/tasks.py b/cl/recap_rss/tasks.py index 4301d16256..42646c7862 100644 --- a/cl/recap_rss/tasks.py +++ b/cl/recap_rss/tasks.py @@ -6,7 +6,7 @@ from calendar import SATURDAY, SUNDAY from datetime import datetime, timedelta -import requests +import httpx from asgiref.sync import 
async_to_sync from celery import Task from dateparser import parse @@ -15,10 +15,10 @@ from django.core.mail import send_mail from django.db import IntegrityError, transaction from django.utils.timezone import now +from httpx import HTTPError from juriscraper.pacer import PacerRssFeed from pytz import timezone from redis import Redis -from requests import HTTPError from cl.alerts.tasks import enqueue_docket_alert from cl.celery_init import app @@ -205,8 +205,8 @@ def check_if_feed_changed( feed_status = RssFeedStatus.objects.get(pk=feed_status_pk) rss_feed = PacerRssFeed(map_cl_to_pacer_id(court_pk)) try: - rss_feed.query() - except requests.RequestException: + async_to_sync(rss_feed.query)() + except httpx.RequestError: logger.warning( "Network error trying to get RSS feed at %s", rss_feed.url ) diff --git a/cl/scrapers/tasks.py b/cl/scrapers/tasks.py index 1dc71f57ec..5b5f287f47 100644 --- a/cl/scrapers/tasks.py +++ b/cl/scrapers/tasks.py @@ -114,7 +114,7 @@ def update_document_from_text( @app.task( bind=True, - autoretry_for=(requests.ConnectionError, requests.ReadTimeout), + autoretry_for=(httpx.ConnectError, httpx.ReadTimeout), max_retries=5, retry_backoff=10, ) @@ -253,7 +253,7 @@ def extract_opinion_content( # TODO: Remove after the new extract_opinion_content is deployed. @app.task( bind=True, - autoretry_for=(requests.ConnectionError, requests.ReadTimeout), + autoretry_for=(httpx.ConnectError, httpx.ReadTimeout), max_retries=5, retry_backoff=10, ) @@ -584,8 +584,8 @@ async def extract_pdf_document_base( @app.task( bind=True, autoretry_for=( - requests.ConnectionError, - requests.ReadTimeout, + httpx.ConnectError, + httpx.ReadTimeout, httpx.TimeoutException, ), max_retries=3, @@ -664,7 +664,7 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: :param court_id: The court of the docket. Needed for throttling by court. 
:return: None """ - session_data = get_or_cache_pacer_cookies( + session_data = async_to_sync(get_or_cache_pacer_cookies)( "pacer_scraper", settings.PACER_USERNAME, password=settings.PACER_PASSWORD, @@ -678,8 +678,8 @@ def update_docket_info_iquery(self, d_pk: int, court_id: str) -> None: d = Docket.objects.get(pk=d_pk, court_id=court_id) report = CaseQuery(map_cl_to_pacer_id(d.court_id), s) try: - report.query(d.pacer_case_id) - except (requests.Timeout, requests.RequestException) as exc: + async_to_sync(report.query)(d.pacer_case_id) + except (httpx.Timeout, httpx.RequestError) as exc: logger.warning( "Timeout or unknown RequestException on iquery crawl. " "Trying again if retries not exceeded." diff --git a/cl/scrapers/utils.py b/cl/scrapers/utils.py index 9d7952e65e..a12790877a 100644 --- a/cl/scrapers/utils.py +++ b/cl/scrapers/utils.py @@ -164,12 +164,12 @@ async def test_for_meta_redirections( :param r: A response object :return: A boolean and value """ - response = await microservice( + mime_response = await microservice( service="buffer-extension", file=r.content, params={"mime": True}, ) - extension = response.text + extension = mime_response.text if extension == ".html": html_tree = html.fromstring(r.text) diff --git a/cl/search/management/commands/pacer_bulk_fetch.py b/cl/search/management/commands/pacer_bulk_fetch.py index 2bbe40c2dd..f04bd16e8f 100644 --- a/cl/search/management/commands/pacer_bulk_fetch.py +++ b/cl/search/management/commands/pacer_bulk_fetch.py @@ -5,6 +5,7 @@ from enum import Enum from typing import Any +from asgiref.sync import async_to_sync from django.contrib.auth.models import User from django.core.cache import cache from django.core.management.base import CommandError @@ -186,7 +187,7 @@ def setup_celery(self) -> None: def handle_pacer_session(self) -> None: """Make sure we have an active PACER session for the user.""" - get_or_cache_pacer_cookies( + async_to_sync(get_or_cache_pacer_cookies)( self.user.pk, 
username=self.pacer_username, password=self.pacer_password, diff --git a/cl/search/tests/test_pacer_bulk_fetch.py b/cl/search/tests/test_pacer_bulk_fetch.py index 728ee373ee..c15fc7d431 100644 --- a/cl/search/tests/test_pacer_bulk_fetch.py +++ b/cl/search/tests/test_pacer_bulk_fetch.py @@ -1,12 +1,13 @@ from datetime import timedelta from unittest.mock import MagicMock, patch +import httpx import time_machine from django.core.cache import cache as django_cache from django.core.management import call_command from django.utils import timezone from django.utils.timezone import now -from requests import HTTPError +from httpx import HTTPError from cl.lib.utils import append_value_in_cache from cl.recap.factories import PacerFetchQueueFactory @@ -23,7 +24,6 @@ ) from cl.search.models import RECAPDocument from cl.tests.cases import TestCase -from cl.tests.utils import MockResponse from cl.users.factories import UserFactory @@ -781,11 +781,8 @@ def tearDown(self): @patch( "cl.recap.tasks.download_pacer_pdf_by_rd", - side_effect=lambda z, x, c, v, b, de_seq_num: ( - MockResponse( - 200, - b"binary content", - ), + return_value=( + httpx.Response(200, content=b"binary content"), "OK", ), ) @@ -919,11 +916,8 @@ def test_abort_fqs_after_error( @patch( "cl.recap.tasks.download_pacer_pdf_by_rd", - side_effect=lambda z, x, c, v, b, de_seq_num: ( - MockResponse( - 200, - None, - ), + return_value=( + httpx.Response(200, content=None), "Document is sealed", ), ) diff --git a/cl/search/tests/tests_semantic_search_opinion.py b/cl/search/tests/tests_semantic_search_opinion.py index d7f880aa34..e82aebf80d 100644 --- a/cl/search/tests/tests_semantic_search_opinion.py +++ b/cl/search/tests/tests_semantic_search_opinion.py @@ -6,6 +6,7 @@ from unittest import mock from unittest.mock import MagicMock +from asgiref.sync import async_to_sync from django.conf import settings from django.core.management import call_command from django.test import TestCase, override_settings @@ -381,9 +382,11 
@@ def _get_mock_for_inception(self, vectors: dict[str, Any] | None = None): inception_response.json.return_value = vectors return inception_response - def _test_api_results_count(self, params, expected_count, field_name): + async def _test_api_results_count( + self, params, expected_count, field_name + ): """Get the result count in a API query response""" - r = self.client.get( + r = await self.async_client.get( reverse("search-list", kwargs={"version": "v4"}), params ) got = len(r.data["results"]) @@ -400,7 +403,8 @@ def _test_api_results_count(self, params, expected_count, field_name): @override_flag("store-search-api-queries", active=True) @override_settings(WAFFLE_CACHE_PREFIX="test_semantic_search_opinion") - def test_can_perform_a_regular_semantic_query( + @async_to_sync + async def test_can_perform_a_regular_semantic_query( self, inception_mock ) -> None: """Can we perform a semantic search using the API?""" @@ -411,10 +415,12 @@ def test_can_perform_a_regular_semantic_query( # Perform search and check that exactly two results are returned search_params = {"q": self.situational_query, "semantic": True} - r = self._test_api_results_count(search_params, 2, "semantic query") + r = await self._test_api_results_count( + search_params, 2, "semantic query" + ) # Ensure a SearchQuery row was logged with SEMANTIC querymode - last_query = SearchQuery.objects.last() + last_query = await SearchQuery.objects.alast() self.assertEqual(last_query.query_mode, SearchQuery.SEMANTIC) content = r.content.decode() @@ -429,7 +435,7 @@ def test_can_perform_a_regular_semantic_query( cluster_id=cluster["cluster_id"], msg="Snippet content test." 
): for opinion in cluster["opinions"]: - record = Opinion.objects.get(id=opinion["id"]) + record = await Opinion.objects.aget(id=opinion["id"]) self.assertNotEqual( opinion["snippet"], record.plain_text[: settings.NO_MATCH_HL_SIZE], @@ -439,7 +445,9 @@ def test_can_perform_a_regular_semantic_query( self.assertNotIn(f'"cluster_id":{self.opinion_4.cluster.id}', content) self.assertNotIn(f'"cluster_id":{self.opinion_5.cluster.id}', content) - def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: + async def test_can_apply_filter_to_semantic_query( + self, inception_mock + ) -> None: """Can we apply filtering to semantic search results?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -453,7 +461,7 @@ def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: } # Should return only the opinion from the Ohio court - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 1, "semantic query with court filter" ) content = r.content.decode() @@ -468,14 +476,16 @@ def test_can_apply_filter_to_semantic_query(self, inception_mock) -> None: } # Should return only the result matching the docket number - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 1, "semantic query with docket number filter" ) content = r.content.decode() self.assertNotIn(f'"cluster_id":{self.opinion_2.cluster.id}', content) self.assertIn(f'"cluster_id":{self.opinion_3.cluster.id}', content) - def test_can_sort_semantic_search_results(self, inception_mock) -> None: + async def test_can_sort_semantic_search_results( + self, inception_mock + ) -> None: """Can we sort semantic search results by cite count?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -487,7 +497,9 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: "semantic": True, "order_by": "citeCount desc", } - r = 
self._test_api_results_count(search_params, 2, "citeCount desc") + r = await self._test_api_results_count( + search_params, 2, "citeCount desc" + ) content = r.content.decode() # Opinion with higher cite count should appear first @@ -504,7 +516,9 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: "semantic": True, "order_by": "citeCount asc", } - r = self._test_api_results_count(search_params, 2, "citeCount asc") + r = await self._test_api_results_count( + search_params, 2, "citeCount asc" + ) content = r.content.decode() # Opinion with lower cite count should appear first @@ -515,14 +529,14 @@ def test_can_sort_semantic_search_results(self, inception_mock) -> None: " ordered by ascending citeCount.", ) - def test_is_semantic_score_standarized(self, inception_mock) -> None: + async def test_is_semantic_score_standarized(self, inception_mock) -> None: """Ensure that semantic scores are consistently returned as floats""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors ) search_params = {"q": self.hybrid_query, "semantic": True} - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 3, "hybrid semantic search query" ) @@ -542,7 +556,7 @@ def test_is_semantic_score_standarized(self, inception_mock) -> None: else: self.assertEqual(semantic_score, 0.0) - def test_can_do_hybrid_search_query(self, inception_mock) -> None: + async def test_can_do_hybrid_search_query(self, inception_mock) -> None: """Can we combine semantic and keyword matches in hybrid search?""" inception_mock.return_value = self._get_mock_for_inception( self.situational_query_vectors @@ -550,7 +564,7 @@ def test_can_do_hybrid_search_query(self, inception_mock) -> None: # Hybrid query should return semantic and keyword matches (3 total) search_params = {"q": self.hybrid_query, "semantic": True} - r = self._test_api_results_count( + r = await self._test_api_results_count( search_params, 3, "hybrid 
semantic search query" ) content = r.content.decode() @@ -573,7 +587,7 @@ def test_can_do_hybrid_search_query(self, inception_mock) -> None: cluster_id=cluster["cluster_id"], msg="Snippet content test." ): for opinion in cluster["opinions"]: - record = Opinion.objects.get(id=opinion["id"]) + record = await Opinion.objects.aget(id=opinion["id"]) if record.id == self.opinion_5.id: self.assertEqual( opinion["snippet"], diff --git a/cl/tests/fakes.py b/cl/tests/fakes.py index 59e1ca3727..20647b8478 100644 --- a/cl/tests/fakes.py +++ b/cl/tests/fakes.py @@ -1,7 +1,7 @@ from datetime import date from unittest.mock import MagicMock -from requests.exceptions import HTTPError, Timeout +from httpx import HTTPError, Response, TimeoutException from cl.corpus_importer.factories import ( CaseQueryDataFactory, @@ -18,7 +18,7 @@ class FakeDocketReport: def __init__(self, *args, **kwargs): pass - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass @property @@ -120,7 +120,7 @@ class FakePossibleCaseNumberApi: def __init__(self, *args, **kwargs): pass - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass def data(self, *args, **kwargs): @@ -139,7 +139,7 @@ class FakeAttachmentPage: def __init__(self, *args, **kwargs): pass - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass @property @@ -158,7 +158,7 @@ class FakeAppellateAttachmentPage: def __init__(self, *args, **kwargs): pass - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass @property @@ -222,10 +222,10 @@ class FakeFreeOpinionReport: def __init__(self, *args, **kwargs): pass - def download_pdf(self, *args, **kwargs) -> tuple[MagicMock, str]: - return MagicMock(content=b""), "" + async def download_pdf(self, *args, **kwargs) -> tuple[Response, str]: + return Response(200, content=b"Hello World"), "" - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass @property @@ -241,7 +241,7 @@ 
class FakeConfirmationPage: def __init__(self, *args, **kwargs): pass - def query(self, *args, **kwargs): + async def query(self, *args, **kwargs): pass @property @@ -325,7 +325,7 @@ def data(self, *args, **kwargs): 16: False, }, "gamb": HTTPError, - "hib": Timeout, + "hib": TimeoutException, "cacd": { 1: False, 2: False, @@ -369,16 +369,16 @@ def __init__(self, court_id, pacer_session=None): self.pacer_case_id = None self.court_id = court_id - def query(self, pacer_case_id): + async def query(self, pacer_case_id): self.pacer_case_id = pacer_case_id @property def data(self): test_pattern = test_patterns.get(self.court_id, {}) if not isinstance(test_pattern, dict) and issubclass( - test_pattern, Exception + test_pattern, (HTTPError | TimeoutException) ): - raise test_pattern() + raise test_pattern(message="Test Pattern Exception") if test_pattern and test_pattern.get(self.pacer_case_id): return CaseQueryDataFactory() diff --git a/pyproject.toml b/pyproject.toml index 71a5ff5ab9..7aabdcd337 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -112,7 +112,7 @@ dependencies = [ "django-cotton>=2.6.0", "django-cursor-pagination>=0.3.0", "django-elasticsearch-dsl>=8.0", - "juriscraper>=3.0.0", + "juriscraper @ git+https://github.com/ttys0dev/juriscraper.git@a61aba66097fd5db8032573eb7fc323f4fed4d10", "instructor>=1.14.1", "django-s3-express-cache>=0.1.0", "zohocrmsdk8-0==4.0.0", @@ -140,6 +140,9 @@ dev = [ "responses>=0.25.8", ] +[tool.uv.sources] +juriscraper = {git = "https://github.com/ttys0dev/juriscraper.git", rev = "a61aba66097fd5db8032573eb7fc323f4fed4d10"} + [tool.ruff] line-length = 79 lint.extend-safe-fixes = [ diff --git a/uv.lock b/uv.lock index cd45ab7525..3577154472 100644 --- a/uv.lock +++ b/uv.lock @@ -504,7 +504,7 @@ requires-dist = [ { name = "ipython", specifier = ">=9.9.0" }, { name = "itypes", specifier = ">=1.1.0" }, { name = "judge-pics", specifier = ">=2.0.5" }, - { name = "juriscraper", specifier = ">=3.0.0" }, + { name = "juriscraper", git = 
"https://github.com/ttys0dev/juriscraper.git?rev=a61aba66097fd5db8032573eb7fc323f4fed4d10" }, { name = "kombu", specifier = ">=5.5.1" }, { name = "lxml", specifier = ">=6.0.2" }, { name = "markdown2", specifier = ">=2.5.4" }, @@ -1856,7 +1856,7 @@ wheels = [ [[package]] name = "juriscraper" version = "3.0.2" -source = { registry = "https://pypi.org/simple" } +source = { git = "https://github.com/ttys0dev/juriscraper.git?rev=a61aba66097fd5db8032573eb7fc323f4fed4d10#a61aba66097fd5db8032573eb7fc323f4fed4d10" } dependencies = [ { name = "certifi" }, { name = "chardet" }, @@ -1870,13 +1870,7 @@ dependencies = [ { name = "lxml" }, { name = "nh3" }, { name = "python-dateutil" }, - { name = "requests" }, { name = "selenium" }, - { name = "tldextract" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/5a/0d/450d9a0a7c9b8f78ac00e420d3865148b3953db5e0172d3d2c0750b6b8dd/juriscraper-3.0.2.tar.gz", hash = "sha256:3273ad81e59cdf1c6789a54aaf421c4e46a80940ef668947d84cdb23056fab48", size = 386253, upload-time = "2026-03-20T15:11:22.283Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/90/6a/b7836b8301357d26463e329b0341e21f51ad3f23bc55387ae56be405dbfb/juriscraper-3.0.2-py3-none-any.whl", hash = "sha256:5b202d41cee5d2d22dd77778d162407b0530518e83f8b476fde7dcd342deee92", size = 612795, upload-time = "2026-03-20T15:11:20.583Z" }, ] [[package]] @@ -2899,18 +2893,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] -[[package]] -name = "requests-file" -version = "3.0.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "requests" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/3c/f8/5dc70102e4d337063452c82e1f0d95e39abfe67aa222ed8a5ddeb9df8de8/requests_file-3.0.1.tar.gz", hash = "sha256:f14243d7796c588f3521bd423c5dea2ee4cc730e54a3cac9574d78aca1272576", size = 6967, upload-time = "2025-10-20T18:56:42.279Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e1/d5/de8f089119205a09da657ed4784c584ede8381a0ce6821212a6d4ca47054/requests_file-3.0.1-py2.py3-none-any.whl", hash = "sha256:d0f5eb94353986d998f80ac63c7f146a307728be051d4d1cd390dbdb59c10fa2", size = 4514, upload-time = "2025-10-20T18:56:41.184Z" }, -] - [[package]] name = "responses" version = "0.25.8" @@ -3275,21 +3257,6 @@ version = "0.5.0" source = { registry = "https://pypi.org/simple" } sdist = { url = "https://files.pythonhosted.org/packages/80/f8/0802dd14c58b5d3d72bb9caa4315535f58787a1dc50b81bbbcaaa15451be/timeout-decorator-0.5.0.tar.gz", hash = "sha256:6a2f2f58db1c5b24a2cc79de6345760377ad8bdc13813f5265f6c3e63d16b3d7", size = 4754, upload-time = "2020-11-15T00:53:06.506Z" } -[[package]] -name = "tldextract" -version = "5.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "filelock" }, - { name = "idna" }, - { name = "requests" }, - { name = "requests-file" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/65/7b/644fbbb49564a6cb124a8582013315a41148dba2f72209bba14a84242bf0/tldextract-5.3.1.tar.gz", hash = "sha256:a72756ca170b2510315076383ea2993478f7da6f897eef1f4a5400735d5057fb", size = 126105, upload-time = "2025-12-28T23:58:05.532Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6d/42/0e49d6d0aac449ca71952ec5bae764af009754fcb2e76a5cc097543747b3/tldextract-5.3.1-py3-none-any.whl", hash = "sha256:6bfe36d518de569c572062b788e16a659ccaceffc486d243af0484e8ecf432d9", size = 105886, upload-time = "2025-12-28T23:58:04.071Z" }, -] - [[package]] name = "tqdm" version = "4.67.1"