diff --git a/docs/types.md b/docs/types.md index e51d867..68851ec 100644 --- a/docs/types.md +++ b/docs/types.md @@ -76,6 +76,13 @@ protarrow.ProtarrowConfig( ) ``` +## Date range limitation + +`google.type.Date` is converted through Python's `datetime.date`, which only supports dates from +`0001-01-01` to `9999-12-31`. Proto `Date` values outside this range (e.g. `Date(year=0, month=0, day=0)`) +cannot be represented as `datetime.date`. These values are stored using a special sentinel value and +will round-trip back as `Date(year=0, month=0, day=0)` regardless of the original month and day. + ## Nullability By default, nullability follows the convention imposed by protobuf: diff --git a/protarrow/arrow_to_proto.py b/protarrow/arrow_to_proto.py index 2daa70b..31dc6ee 100644 --- a/protarrow/arrow_to_proto.py +++ b/protarrow/arrow_to_proto.py @@ -23,7 +23,13 @@ from google.type.date_pb2 import Date from google.type.timeofday_pb2 import TimeOfDay -from protarrow.common import M, is_binary_enum, is_string_enum, offset_values_array +from protarrow.common import ( + _INVALID_DATE_SENTINEL, + M, + is_binary_enum, + is_string_enum, + offset_values_array, +) _NANOS_PER_UNIT = {"ns": 1, "us": 1_000, "ms": 1_000_000, "s": 1_000_000_000} _TIME_CONVERTER = { @@ -59,11 +65,10 @@ def _timestamp_s_scalar_to_proto(scalar: pa.TimestampScalar) -> Timestamp: def _date_scalar_to_proto(scalar: pa.Date32Scalar) -> Date: - date: datetime.date = scalar.as_py() - if date == datetime.date.min: + if scalar.value == _INVALID_DATE_SENTINEL: return Date() - else: - return Date(year=date.year, month=date.month, day=date.day) + date: datetime.date = scalar.as_py() + return Date(year=date.year, month=date.month, day=date.day) def _time_64_ns_scalar_to_proto(scalar: pa.Time64Scalar) -> TimeOfDay: diff --git a/protarrow/common.py b/protarrow/common.py index e3ccdcb..97a2c21 100644 --- a/protarrow/common.py +++ b/protarrow/common.py @@ -6,6 +6,8 @@ M = TypeVar("M", bound=Message) +_INVALID_DATE_SENTINEL = -719163 + SUPPORTED_ENUM_TYPES = ( pa.int32(), pa.binary(), diff --git a/protarrow/proto_to_arrow.py b/protarrow/proto_to_arrow.py index f9909e9..190f169 100644 --- a/protarrow/proto_to_arrow.py +++ b/protarrow/proto_to_arrow.py @@ -37,7 +37,13 @@ from google.type.date_pb2 import Date from google.type.timeofday_pb2 import TimeOfDay -from protarrow.common import M, ProtarrowConfig, is_binary_enum, is_string_enum +from protarrow.common import ( + _INVALID_DATE_SENTINEL, + M, + ProtarrowConfig, + is_binary_enum, + is_string_enum, +) _PROTO_DESCRIPTOR_TO_PYARROW = { BoolValue.DESCRIPTOR: pa.bool_(), @@ -89,15 +95,16 @@ def _time_of_day_to_seconds(time_of_day: TimeOfDay) -> int: return (time_of_day.hours * 60 + time_of_day.minutes) * 60 + time_of_day.seconds -def _proto_date_to_py_date(proto_date: Date) -> datetime.date: +def _proto_date_to_date32(proto_date: Date) -> int: if proto_date.year == 0: - return datetime.date.min + return _INVALID_DATE_SENTINEL else: - return datetime.date(proto_date.year, proto_date.month, proto_date.day) + date = datetime.date(proto_date.year, proto_date.month, proto_date.day) + return date.toordinal() + _INVALID_DATE_SENTINEL _PROTO_DESCRIPTOR_TO_ARROW_CONVERTER = { - Date.DESCRIPTOR: _proto_date_to_py_date, + Date.DESCRIPTOR: _proto_date_to_date32, TimeOfDay.DESCRIPTOR: _time_of_day_to_nanos, BoolValue.DESCRIPTOR: operator.attrgetter("value"), BytesValue.DESCRIPTOR: operator.attrgetter("value"), diff --git a/tests/random_generator.py b/tests/random_generator.py index f8b7201..728718c 100644 --- a/tests/random_generator.py +++ b/tests/random_generator.py @@ -42,7 +42,7 @@ def random_duration() -> Duration: def random_date() -> Date: - date = datetime.date.min + datetime.timedelta(days=random.randint(0, 3652058)) + date = datetime.date.fromordinal(random.randint(1, 3652059)) return Date(year=date.year, month=date.month, day=date.day) diff --git a/tests/test_coverage.py b/tests/test_coverage.py index 3229546..4e8fbc8 100644 --- a/tests/test_coverage.py +++ b/tests/test_coverage.py @@ -11,6 +11,7 @@ import pytest from google.protobuf.descriptor import Descriptor, EnumDescriptor, FieldDescriptor from google.protobuf.wrappers_pb2 import BoolValue, DoubleValue +from google.type.date_pb2 import Date import protarrow from protarrow import cast_record_batch @@ -575,3 +576,59 @@ def test_dict_binary_enum_with_large_binary(self): binary_type=pa.large_binary(), ) assert config.enum_type == pa.dictionary(pa.int32(), pa.binary()) + + +def test_date_behavior(): + assert pa.scalar(0, pa.date32()).as_py() == datetime.date(1970, 1, 1) + assert pa.scalar(-1, pa.date32()).as_py() == datetime.date(1969, 12, 31) + assert pa.scalar(-719162, pa.date32()).as_py() == datetime.date(1, 1, 1) + with pytest.raises(OverflowError, match=r"date value out of range"): + assert pa.scalar(-719163, pa.date32()).as_py() + + assert datetime.date(1970, 1, 1).toordinal() == 719163 + assert datetime.date.min.toordinal() == 1 + assert datetime.date.max.toordinal() == 3652059 + + assert datetime.date.fromordinal(1) == datetime.date.min + assert datetime.date.fromordinal(3652059) == datetime.date.max + + +def test_bad_date(): + default_date = Date() + min_date = Date(year=1, month=1, day=1) + + assert default_date != min_date + + +def test_can_pass_min_max_date(): + default_date = Date() + bad_date = Date(year=0, month=1, day=1) + min_date = Date(year=1, month=1, day=1) + max_date = Date(year=9999, month=12, day=31) + + messages = [ + ExampleMessage(date_value=default_date), + ExampleMessage(date_value=bad_date), + ExampleMessage(date_value=min_date), + ExampleMessage(date_value=max_date), + ] + table = protarrow.messages_to_table(messages, ExampleMessage) + assert table["date_value"] == pa.chunked_array( + pa.array( + [ + -719163, + -719163, + -719162, + 2932896, + ], + type=pa.date32(), + ) + ) + + messages_back = protarrow.table_to_messages(table, ExampleMessage) + assert messages_back == [ + ExampleMessage(date_value=default_date), + ExampleMessage(date_value=default_date), # This changed + ExampleMessage(date_value=min_date), + ExampleMessage(date_value=max_date), + ]