Skip to content

feature request: arrow to iceberg type mapping #15666

@kevinjqliu

Description

@kevinjqliu

Feature Request / Improvement

Inspired by apache/iceberg-python#3098, which documents the mapping between PyArrow and PyIceberg types.

I think it would be useful to document the Iceberg type <> Arrow type mapping as well. What do you think?

Maybe we could just put the mapping implemented by the following converter into writing:

/**
 * Schema visitor that produces an Arrow {@link Field} for an Iceberg type.
 *
 * <p>Each instance is bound to a single {@link NestedField}. That field supplies the name and
 * the nullability flag of every Arrow field the instance builds. Children of nested types are
 * converted by fresh instances, one per child field (see {@code convertChildren}).
 */
private static class IcebergToArrowTypeConverter extends TypeUtil.SchemaVisitor<Field> {
  /** The Iceberg field whose name and optionality are applied to the produced Arrow field. */
  private final NestedField currentField;

  /**
   * Creates a converter bound to {@code field}.
   *
   * @param field the Iceberg field that names the resulting Arrow field
   */
  IcebergToArrowTypeConverter(NestedField field) {
    this.currentField = field;
  }

  /** Passes the already-converted struct result through unchanged. */
  @Override
  public Field schema(org.apache.iceberg.Schema schema, Field structResult) {
    return structResult;
  }

  /**
   * Builds an Arrow struct field named after the bound field.
   *
   * <p>{@code fieldResults} is not used; the children are converted again through
   * {@code convertChildren} so that each one is built by a converter bound to its own field.
   */
  @Override
  public Field struct(StructType struct, List<Field> fieldResults) {
    String name = currentField.name();
    FieldType structType =
        new FieldType(currentField.isOptional(), ArrowType.Struct.INSTANCE, null);
    List<Field> members = convertChildren(struct.fields());
    return new Field(name, structType, members);
  }

  /** Passes the converted field result through unchanged. */
  @Override
  public Field field(NestedField field, Field fieldResult) {
    return fieldResult;
  }

  /**
   * Builds an Arrow list field named after the bound field.
   *
   * <p>{@code elementResult} is not used; the fields of the list are converted again through
   * {@code convertChildren}.
   */
  @Override
  public Field list(ListType list, Field elementResult) {
    String name = currentField.name();
    FieldType listType = new FieldType(currentField.isOptional(), ArrowType.List.INSTANCE, null);
    List<Field> elements = convertChildren(list.fields());
    return new Field(name, listType, elements);
  }

  /**
   * Builds an Arrow map field named after the bound field.
   *
   * <p>The result has exactly one child: an entry field with an empty name whose own children
   * are the converted fields of the Iceberg map. The outer field carries a metadata entry
   * {@code ORIGINAL_TYPE -> MAP_TYPE} (both constants are defined outside this class).
   * {@code keyResult} and {@code valueResult} are not used.
   */
  @Override
  public Field map(MapType map, Field keyResult, Field valueResult) {
    Map<String, String> originalTypeMetadata = ImmutableMap.of(ORIGINAL_TYPE, MAP_TYPE);
    // NOTE(review): the boolean is presumably Arrow's "keys sorted" flag - confirm.
    ArrowType mapArrowType = new ArrowType.Map(false);
    List<Field> keyAndValue = convertChildren(map.fields());

    // The entry field reuses the same Map arrow type and the bound field's optionality.
    FieldType entryType = new FieldType(currentField.isOptional(), mapArrowType, null);
    Field entry = new Field("", entryType, keyAndValue);
    List<Field> mapChildren = Lists.newArrayList(entry);

    String name = currentField.name();
    FieldType mapType =
        new FieldType(currentField.isOptional(), mapArrowType, null, originalTypeMetadata);
    return new Field(name, mapType, mapChildren);
  }

  /**
   * Converts each child field with a new converter bound to that child.
   *
   * @param children the Iceberg fields to convert
   * @return the converted Arrow fields, in iteration order of {@code children}
   */
  private List<Field> convertChildren(Collection<NestedField> children) {
    List<Field> arrowFields = Lists.newArrayListWithCapacity(children.size());
    children.forEach(
        nested ->
            arrowFields.add(
                TypeUtil.visit(nested.type(), new IcebergToArrowTypeConverter(nested))));
    return arrowFields;
  }

  /**
   * Builds a childless Arrow field for a primitive Iceberg type.
   *
   * @throws UnsupportedOperationException if the primitive's type id has no mapping here
   */
  @Override
  public Field primitive(Type.PrimitiveType primitive) {
    ArrowType mapped = arrowTypeFor(primitive);
    String name = currentField.name();
    FieldType primitiveType = new FieldType(currentField.isOptional(), mapped, null);
    return new Field(name, primitiveType, Lists.newArrayList());
  }

  /**
   * Selects the Arrow type for a primitive Iceberg type.
   *
   * <ul>
   *   <li>BINARY: Binary
   *   <li>FIXED: FixedSizeBinary of the fixed type's length
   *   <li>BOOLEAN: Bool
   *   <li>INTEGER / LONG: signed Int of {@code Integer.SIZE} / {@code Long.SIZE} bits
   *   <li>FLOAT / DOUBLE: FloatingPoint SINGLE / DOUBLE
   *   <li>DECIMAL: Decimal with the source precision and scale, bit width 128
   *   <li>STRING: Utf8
   *   <li>TIME: Time in microseconds, {@code Long.SIZE} bits
   *   <li>UUID: FixedSizeBinary(16)
   *   <li>TIMESTAMP / TIMESTAMP_NANO: Timestamp in micro / nanoseconds, with timezone "UTC"
   *       when the type adjusts to UTC and no timezone otherwise
   *   <li>DATE: Date in days
   * </ul>
   *
   * @throws UnsupportedOperationException for any other type id
   */
  private static ArrowType arrowTypeFor(Type.PrimitiveType primitive) {
    switch (primitive.typeId()) {
      case BINARY:
        return ArrowType.Binary.INSTANCE;
      case FIXED: {
        Types.FixedType fixed = (Types.FixedType) primitive;
        return new ArrowType.FixedSizeBinary(fixed.length());
      }
      case BOOLEAN:
        return ArrowType.Bool.INSTANCE;
      case INTEGER:
        return new ArrowType.Int(Integer.SIZE, true /* signed */);
      case LONG:
        return new ArrowType.Int(Long.SIZE, true /* signed */);
      case FLOAT:
        return new ArrowType.FloatingPoint(FloatingPointPrecision.SINGLE);
      case DOUBLE:
        return new ArrowType.FloatingPoint(FloatingPointPrecision.DOUBLE);
      case DECIMAL: {
        Types.DecimalType decimal = (Types.DecimalType) primitive;
        return new ArrowType.Decimal(decimal.precision(), decimal.scale(), 128);
      }
      case STRING:
        return ArrowType.Utf8.INSTANCE;
      case TIME:
        return new ArrowType.Time(TimeUnit.MICROSECOND, Long.SIZE);
      case UUID:
        return new ArrowType.FixedSizeBinary(16);
      case TIMESTAMP: {
        Types.TimestampType micros = (Types.TimestampType) primitive;
        String zone = micros.shouldAdjustToUTC() ? "UTC" : null;
        return new ArrowType.Timestamp(TimeUnit.MICROSECOND, zone);
      }
      case TIMESTAMP_NANO: {
        Types.TimestampNanoType nanos = (Types.TimestampNanoType) primitive;
        String zone = nanos.shouldAdjustToUTC() ? "UTC" : null;
        return new ArrowType.Timestamp(TimeUnit.NANOSECOND, zone);
      }
      case DATE:
        return new ArrowType.Date(DateUnit.DAY);
      default:
        throw new UnsupportedOperationException("Unsupported primitive type: " + primitive);
    }
  }
}

Query engine

None

Willingness to contribute

  • I can contribute this improvement/feature independently
  • I would be willing to contribute this improvement/feature with guidance from the Iceberg community
  • I cannot contribute this improvement/feature at this time

Metadata

Metadata

Assignees

Labels

improvement: PR that improves existing functionality

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions