From a2f8499e1db013aba6bc58a3905cd76fce073f48 Mon Sep 17 00:00:00 2001 From: 0x26res Date: Fri, 13 Mar 2026 15:37:26 +0000 Subject: [PATCH 1/3] Add reproducible example --- tests/test_conversion.py | 13 ++++++++++--- tests/test_coverage.py | 25 +++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/tests/test_conversion.py b/tests/test_conversion.py index ba08945..377cca6 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -451,9 +451,16 @@ def test_get_arrow_default_value(enum_type: pa.DataType, expected: Any): def _check_messages_same(actual: Iterable[Message], expected: Iterable[Message]): - for left, right in zip(actual, expected): - assert left == right - assert actual == expected + for i, (left, right) in enumerate(zip(actual, expected)): + if left != right: + left_path = pathlib.Path(f"protarrow_mismatch_left_{i}.binpb") + right_path = pathlib.Path(f"protarrow_mismatch_right_{i}.binpb") + left_path.write_bytes(left.SerializeToString()) + right_path.write_bytes(right.SerializeToString()) + raise AssertionError( + f"Message {i} differs. 
Saved to {left_path} and {right_path}" + ) + assert len(actual) == len(expected) def test_nested_field_values_not_null_when_message_missing(): diff --git a/tests/test_coverage.py b/tests/test_coverage.py index 3229546..536af09 100644 --- a/tests/test_coverage.py +++ b/tests/test_coverage.py @@ -11,6 +11,7 @@ import pytest from google.protobuf.descriptor import Descriptor, EnumDescriptor, FieldDescriptor from google.protobuf.wrappers_pb2 import BoolValue, DoubleValue +from google.type.date_pb2 import Date import protarrow from protarrow import cast_record_batch @@ -575,3 +576,27 @@ def test_dict_binary_enum_with_large_binary(self): binary_type=pa.large_binary(), ) assert config.enum_type == pa.dictionary(pa.int32(), pa.binary()) + + +def test_flaky(): + default_date = Date() + date = Date(year=1, month=1, day=1) + + assert default_date != date + + example = ExampleMessage(date_value=date, date_values=[date]) + nested = NestedExampleMessage( + example_message=example, + repeated_example_message=[example], + ) + nested.example_message_int32_map[1].CopyFrom(example) + nested.example_message_string_map["a"].CopyFrom(example) + message = SuperNestedExampleMessage( + nested_example_message=nested, + repeated_nested_example_message=[nested], + ) + message.nested_example_message_int32_map[1].CopyFrom(example) + message.nested_example_message_string_map["a"].CopyFrom(example) + table = protarrow.messages_to_table([message], SuperNestedExampleMessage) + result = protarrow.table_to_messages(table, SuperNestedExampleMessage) + assert result == [message] From f0d315f3f175b7749885cd0a6b8b8cb32a71570d Mon Sep 17 00:00:00 2001 From: 0x26res Date: Fri, 13 Mar 2026 16:59:43 +0000 Subject: [PATCH 2/3] feat: doc --- docs/types.md | 7 ++++ protarrow/arrow_to_proto.py | 10 +++--- protarrow/proto_to_arrow.py | 12 ++++--- tests/random_generator.py | 2 +- tests/test_conversion.py | 13 ++----- tests/test_coverage.py | 68 +++++++++++++++++++++++++++---------- 6 files changed, 75 
insertions(+), 37 deletions(-) diff --git a/docs/types.md b/docs/types.md index e51d867..68851ec 100644 --- a/docs/types.md +++ b/docs/types.md @@ -76,6 +76,13 @@ protarrow.ProtarrowConfig( ) ``` +## Date range limitation + +`google.type.Date` is converted through Python's `datetime.date`, which only supports dates from +`0001-01-01` to `9999-12-31`. Proto `Date` values outside this range (e.g. `Date(year=0, month=0, day=0)`) +cannot be represented as `datetime.date`. These values are stored using a special sentinel value and +will round-trip back as `Date(year=0, month=0, day=0)` regardless of the original month and day. + ## Nullability By default, nullability follows the convention imposed by protobuf: diff --git a/protarrow/arrow_to_proto.py b/protarrow/arrow_to_proto.py index 2daa70b..418630a 100644 --- a/protarrow/arrow_to_proto.py +++ b/protarrow/arrow_to_proto.py @@ -58,12 +58,14 @@ def _timestamp_s_scalar_to_proto(scalar: pa.TimestampScalar) -> Timestamp: return timestamp +_INVALID_DATE_SENTINEL = -719163 + + def _date_scalar_to_proto(scalar: pa.Date32Scalar) -> Date: - date: datetime.date = scalar.as_py() - if date == datetime.date.min: + if scalar.value == _INVALID_DATE_SENTINEL: return Date() - else: - return Date(year=date.year, month=date.month, day=date.day) + date: datetime.date = scalar.as_py() + return Date(year=date.year, month=date.month, day=date.day) def _time_64_ns_scalar_to_proto(scalar: pa.Time64Scalar) -> TimeOfDay: diff --git a/protarrow/proto_to_arrow.py b/protarrow/proto_to_arrow.py index f9909e9..8fb55db 100644 --- a/protarrow/proto_to_arrow.py +++ b/protarrow/proto_to_arrow.py @@ -89,15 +89,19 @@ def _time_of_day_to_seconds(time_of_day: TimeOfDay) -> int: return (time_of_day.hours * 60 + time_of_day.minutes) * 60 + time_of_day.seconds -def _proto_date_to_py_date(proto_date: Date) -> datetime.date: +_INVALID_DATE_SENTINEL = -719163 + + +def _proto_date_to_date32(proto_date: Date) -> int: if proto_date.year == 0: - return 
datetime.date.min + return _INVALID_DATE_SENTINEL else: - return datetime.date(proto_date.year, proto_date.month, proto_date.day) + date = datetime.date(proto_date.year, proto_date.month, proto_date.day) + return date.toordinal() - 719163 _PROTO_DESCRIPTOR_TO_ARROW_CONVERTER = { - Date.DESCRIPTOR: _proto_date_to_py_date, + Date.DESCRIPTOR: _proto_date_to_date32, TimeOfDay.DESCRIPTOR: _time_of_day_to_nanos, BoolValue.DESCRIPTOR: operator.attrgetter("value"), BytesValue.DESCRIPTOR: operator.attrgetter("value"), diff --git a/tests/random_generator.py b/tests/random_generator.py index f8b7201..728718c 100644 --- a/tests/random_generator.py +++ b/tests/random_generator.py @@ -42,7 +42,7 @@ def random_duration() -> Duration: def random_date() -> Date: - date = datetime.date.min + datetime.timedelta(days=random.randint(0, 3652058)) + date = datetime.date.fromordinal(random.randint(1, 3652059)) return Date(year=date.year, month=date.month, day=date.day) diff --git a/tests/test_conversion.py b/tests/test_conversion.py index 377cca6..ba08945 100644 --- a/tests/test_conversion.py +++ b/tests/test_conversion.py @@ -451,16 +451,9 @@ def test_get_arrow_default_value(enum_type: pa.DataType, expected: Any): def _check_messages_same(actual: Iterable[Message], expected: Iterable[Message]): - for i, (left, right) in enumerate(zip(actual, expected)): - if left != right: - left_path = pathlib.Path(f"protarrow_mismatch_left_{i}.binpb") - right_path = pathlib.Path(f"protarrow_mismatch_right_{i}.binpb") - left_path.write_bytes(left.SerializeToString()) - right_path.write_bytes(right.SerializeToString()) - raise AssertionError( - f"Message {i} differs. 
Saved to {left_path} and {right_path}" - ) - assert len(actual) == len(expected) + for left, right in zip(actual, expected): + assert left == right + assert actual == expected def test_nested_field_values_not_null_when_message_missing(): diff --git a/tests/test_coverage.py b/tests/test_coverage.py index 536af09..4e8fbc8 100644 --- a/tests/test_coverage.py +++ b/tests/test_coverage.py @@ -578,25 +578,57 @@ def test_dict_binary_enum_with_large_binary(self): assert config.enum_type == pa.dictionary(pa.int32(), pa.binary()) -def test_flaky(): +def test_date_behavior(): + assert pa.scalar(0, pa.date32()).as_py() == datetime.date(1970, 1, 1) + assert pa.scalar(-1, pa.date32()).as_py() == datetime.date(1969, 12, 31) + assert pa.scalar(-719162, pa.date32()).as_py() == datetime.date(1, 1, 1) + with pytest.raises(OverflowError, match=r"date value out of range"): + assert pa.scalar(-719163, pa.date32()).as_py() + + assert datetime.date(1970, 1, 1).toordinal() == 719163 + assert datetime.date.min.toordinal() == 1 + assert datetime.date.max.toordinal() == 3652059 + + assert datetime.date.fromordinal(1) == datetime.date.min + assert datetime.date.fromordinal(3652059) == datetime.date.max + + +def test_bad_date(): default_date = Date() - date = Date(year=1, month=1, day=1) + min_date = Date(year=1, month=1, day=1) - assert default_date != date + assert default_date != min_date - example = ExampleMessage(date_value=date, date_values=[date]) - nested = NestedExampleMessage( - example_message=example, - repeated_example_message=[example], - ) - nested.example_message_int32_map[1].CopyFrom(example) - nested.example_message_string_map["a"].CopyFrom(example) - message = SuperNestedExampleMessage( - nested_example_message=nested, - repeated_nested_example_message=[nested], + +def test_can_pass_min_max_date(): + default_date = Date() + bad_date = Date(year=0, month=1, day=1) + min_date = Date(year=1, month=1, day=1) + max_date = Date(year=9999, month=12, day=31) + + messages = [ + 
ExampleMessage(date_value=default_date), + ExampleMessage(date_value=bad_date), + ExampleMessage(date_value=min_date), + ExampleMessage(date_value=max_date), + ] + table = protarrow.messages_to_table(messages, ExampleMessage) + assert table["date_value"] == pa.chunked_array( + pa.array( + [ + -719163, + -719163, + -719162, + 2932896, + ], + type=pa.date32(), + ) ) - message.nested_example_message_int32_map[1].CopyFrom(example) - message.nested_example_message_string_map["a"].CopyFrom(example) - table = protarrow.messages_to_table([message], SuperNestedExampleMessage) - result = protarrow.table_to_messages(table, SuperNestedExampleMessage) - assert result == [message] + + messages_back = protarrow.table_to_messages(table, ExampleMessage) + assert messages_back == [ + ExampleMessage(date_value=default_date), + ExampleMessage(date_value=default_date), # This changed + ExampleMessage(date_value=min_date), + ExampleMessage(date_value=max_date), + ] From 9d9ba12908feb360d1d2b1a692dbc5517205d48a Mon Sep 17 00:00:00 2001 From: 0x26res Date: Fri, 13 Mar 2026 17:12:26 +0000 Subject: [PATCH 3/3] feat: move sentinel --- protarrow/arrow_to_proto.py | 11 +++++++---- protarrow/common.py | 2 ++ protarrow/proto_to_arrow.py | 13 ++++++++----- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/protarrow/arrow_to_proto.py b/protarrow/arrow_to_proto.py index 418630a..31dc6ee 100644 --- a/protarrow/arrow_to_proto.py +++ b/protarrow/arrow_to_proto.py @@ -23,7 +23,13 @@ from google.type.date_pb2 import Date from google.type.timeofday_pb2 import TimeOfDay -from protarrow.common import M, is_binary_enum, is_string_enum, offset_values_array +from protarrow.common import ( + _INVALID_DATE_SENTINEL, + M, + is_binary_enum, + is_string_enum, + offset_values_array, +) _NANOS_PER_UNIT = {"ns": 1, "us": 1_000, "ms": 1_000_000, "s": 1_000_000_000} _TIME_CONVERTER = { @@ -58,9 +64,6 @@ def _timestamp_s_scalar_to_proto(scalar: pa.TimestampScalar) -> Timestamp: return timestamp 
-_INVALID_DATE_SENTINEL = -719163 - - def _date_scalar_to_proto(scalar: pa.Date32Scalar) -> Date: if scalar.value == _INVALID_DATE_SENTINEL: return Date() diff --git a/protarrow/common.py b/protarrow/common.py index e3ccdcb..97a2c21 100644 --- a/protarrow/common.py +++ b/protarrow/common.py @@ -6,6 +6,8 @@ M = TypeVar("M", bound=Message) +_INVALID_DATE_SENTINEL = -719163 + SUPPORTED_ENUM_TYPES = ( pa.int32(), pa.binary(), diff --git a/protarrow/proto_to_arrow.py b/protarrow/proto_to_arrow.py index 8fb55db..190f169 100644 --- a/protarrow/proto_to_arrow.py +++ b/protarrow/proto_to_arrow.py @@ -37,7 +37,13 @@ from google.type.date_pb2 import Date from google.type.timeofday_pb2 import TimeOfDay -from protarrow.common import M, ProtarrowConfig, is_binary_enum, is_string_enum +from protarrow.common import ( + _INVALID_DATE_SENTINEL, + M, + ProtarrowConfig, + is_binary_enum, + is_string_enum, +) _PROTO_DESCRIPTOR_TO_PYARROW = { BoolValue.DESCRIPTOR: pa.bool_(), @@ -89,15 +95,12 @@ def _time_of_day_to_seconds(time_of_day: TimeOfDay) -> int: return (time_of_day.hours * 60 + time_of_day.minutes) * 60 + time_of_day.seconds -_INVALID_DATE_SENTINEL = -719163 - - def _proto_date_to_date32(proto_date: Date) -> int: if proto_date.year == 0: return _INVALID_DATE_SENTINEL else: date = datetime.date(proto_date.year, proto_date.month, proto_date.day) - return date.toordinal() - 719163 + return date.toordinal() + _INVALID_DATE_SENTINEL _PROTO_DESCRIPTOR_TO_ARROW_CONVERTER = {