Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 38 additions & 9 deletions api/v1alpha1/unstructureddataproduct_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package v1alpha1

import (
"fmt"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

Expand All @@ -27,15 +29,30 @@ type (
)

const (
SourceTypeS3 UnstructuredDataSourceType = "s3"
DestinationTypeInternalStage UnstructuredDataDestinationType = "snowflakeInternalStage"
ChunkingStrategyRecursiveCharacter ChunkingStrategy = "recursiveCharacterTextSplitter"
ChunkingStrategyMarkdown ChunkingStrategy = "markdownTextSplitter"
ChunkingStrategyToken ChunkingStrategy = "tokenTextSplitter"
SourceTypeS3 UnstructuredDataSourceType = "s3"
DestinationTypeInternalStage UnstructuredDataDestinationType = "snowflakeInternalStage"
ChunkingStrategyRecursiveCharacter ChunkingStrategy = "recursiveCharacterTextSplitter"
ChunkingStrategyMarkdown ChunkingStrategy = "markdownTextSplitter"
ChunkingStrategyToken ChunkingStrategy = "tokenTextSplitter"
ChunkingStrategyDoclingHierarchical ChunkingStrategy = "doclingHierarchicalChunker"
ChunkingStrategyDoclingHybrid ChunkingStrategy = "doclingHybridChunker"

UnstructuredDataProductCondition = "UnstructuredDataProductReady"
)

func IsDoclingChunkingStrategy(strategy ChunkingStrategy) bool {
return strategy == ChunkingStrategyDoclingHierarchical || strategy == ChunkingStrategyDoclingHybrid
}

func (s *UnstructuredDataProductSpec) ValidateSpec() error {
if IsDoclingChunkingStrategy(s.ChunksGeneratorConfig.Strategy) {
if s.DocumentProcessorConfig != nil {
return fmt.Errorf("documentProcessorConfig must not be set when using Docling chunking strategy %q; Docling performs conversion and chunking in a single step", s.ChunksGeneratorConfig.Strategy)
}
}
return nil
}

type DocumentProcessorConfig struct {
Type string `json:"type,omitempty"`
DoclingConfig DoclingConfig `json:"doclingConfig,omitempty"`
Expand All @@ -59,6 +76,8 @@ type ChunksGeneratorConfig struct {
RecursiveCharacterSplitterConfig RecursiveCharacterSplitterConfig `json:"recursiveCharacterSplitterConfig,omitempty"`
MarkdownSplitterConfig MarkdownSplitterConfig `json:"markdownSplitterConfig,omitempty"`
TokenSplitterConfig TokenSplitterConfig `json:"tokenSplitterConfig,omitempty"`
DoclingHierarchicalChunkerConfig DoclingHierarchicalChunkerConfig `json:"doclingHierarchicalChunkerConfig,omitempty"`
DoclingHybridChunkerConfig DoclingHybridChunkerConfig `json:"doclingHybridChunkerConfig,omitempty"`
Comment on lines +79 to +80
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should add validations such that if these are specified then DocumentProcessorConfig is not set

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Added ValidateSpec() on UnstructuredDataProductSpec that returns an error when a Docling chunking strategy is selected and DocumentProcessorConfig is set. Wired it up early in the Reconcile() loop so invalid specs are rejected before any CRs are created. Also changed DocumentProcessorConfig to a pointer so the nil check is clean.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should be done via validating webhook but it's understandable that it's not configured right now for this controller ... have you evaluated doing this via kubebuilder's inbuilt CRD validation - https://book.kubebuilder.io/reference/markers/crd-validation.html

i am suggesting this approach because i want the end result that the CR is not even applied and is rejected at creation time rather than getting accepted and then causes a runtime failure

what you've implemented is like an API request is accepted and fails at runtime ... what i want is that the API requested is rejected if it's incorrect

}

type RecursiveCharacterSplitterConfig struct {
Expand Down Expand Up @@ -86,12 +105,22 @@ type TokenSplitterConfig struct {
DisallowedSpecial []string `json:"disallowedSpecial,omitempty"`
}

type DoclingHierarchicalChunkerConfig struct {
MergeListItems *bool `json:"mergeListItems,omitempty"`
}

type DoclingHybridChunkerConfig struct {
Tokenizer string `json:"tokenizer,omitempty"`
MaxTokens int `json:"maxTokens,omitempty"`
MergePeers *bool `json:"mergePeers,omitempty"`
}

// UnstructuredDataProductSpec defines the desired state of UnstructuredDataProduct
type UnstructuredDataProductSpec struct {
SourceConfig SourceConfig `json:"sourceConfig,omitempty"`
DestinationConfig DestinationConfig `json:"destinationConfig,omitempty"`
DocumentProcessorConfig DocumentProcessorConfig `json:"documentProcessorConfig,omitempty"`
ChunksGeneratorConfig ChunksGeneratorConfig `json:"chunksGeneratorConfig,omitempty"`
SourceConfig SourceConfig `json:"sourceConfig,omitempty"`
DestinationConfig DestinationConfig `json:"destinationConfig,omitempty"`
DocumentProcessorConfig *DocumentProcessorConfig `json:"documentProcessorConfig,omitempty"`
ChunksGeneratorConfig ChunksGeneratorConfig `json:"chunksGeneratorConfig,omitempty"`
}

type SourceConfig struct {
Expand Down
150 changes: 150 additions & 0 deletions api/v1alpha1/unstructureddataproduct_types_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/*
Copyright 2026.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package v1alpha1

import (
"testing"
)

func TestValidateSpec(t *testing.T) {
tests := []struct {
name string
spec UnstructuredDataProductSpec
wantErr bool
}{
{
name: "docling hierarchical with DocumentProcessorConfig set returns error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyDoclingHierarchical,
},
DocumentProcessorConfig: &DocumentProcessorConfig{
Type: "docling",
},
},
wantErr: true,
},
{
name: "docling hybrid with DocumentProcessorConfig set returns error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyDoclingHybrid,
},
DocumentProcessorConfig: &DocumentProcessorConfig{
Type: "docling",
},
},
wantErr: true,
},
{
name: "docling hierarchical without DocumentProcessorConfig returns no error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyDoclingHierarchical,
},
},
wantErr: false,
},
{
name: "docling hybrid without DocumentProcessorConfig returns no error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyDoclingHybrid,
},
},
wantErr: false,
},
{
name: "non-docling strategy with DocumentProcessorConfig set returns no error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyRecursiveCharacter,
},
DocumentProcessorConfig: &DocumentProcessorConfig{
Type: "docling",
},
},
wantErr: false,
},
{
name: "non-docling strategy without DocumentProcessorConfig returns no error",
spec: UnstructuredDataProductSpec{
ChunksGeneratorConfig: ChunksGeneratorConfig{
Strategy: ChunkingStrategyMarkdown,
},
},
wantErr: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := tt.spec.ValidateSpec()
if (err != nil) != tt.wantErr {
t.Errorf("ValidateSpec() error = %v, wantErr %v", err, tt.wantErr)
}
})
}
}

func TestIsDoclingChunkingStrategy(t *testing.T) {
tests := []struct {
name string
strategy ChunkingStrategy
want bool
}{
{
name: "docling hierarchical",
strategy: ChunkingStrategyDoclingHierarchical,
want: true,
},
{
name: "docling hybrid",
strategy: ChunkingStrategyDoclingHybrid,
want: true,
},
{
name: "recursive character",
strategy: ChunkingStrategyRecursiveCharacter,
want: false,
},
{
name: "markdown",
strategy: ChunkingStrategyMarkdown,
want: false,
},
{
name: "token",
strategy: ChunkingStrategyToken,
want: false,
},
{
name: "empty string",
strategy: "",
want: false,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
got := IsDoclingChunkingStrategy(tt.strategy)
if got != tt.want {
t.Errorf("IsDoclingChunkingStrategy(%q) = %v, want %v", tt.strategy, got, tt.want)
}
})
}
}
48 changes: 47 additions & 1 deletion api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,20 @@ spec:
properties:
config:
properties:
doclingHierarchicalChunkerConfig:
properties:
mergeListItems:
type: boolean
type: object
doclingHybridChunkerConfig:
properties:
maxTokens:
type: integer
mergePeers:
type: boolean
tokenizer:
type: string
type: object
markdownSplitterConfig:
properties:
chunkOverlap:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,20 @@ spec:
properties:
chunksGeneratorConfig:
properties:
doclingHierarchicalChunkerConfig:
properties:
mergeListItems:
type: boolean
type: object
doclingHybridChunkerConfig:
properties:
maxTokens:
type: integer
mergePeers:
type: boolean
tokenizer:
type: string
type: object
markdownSplitterConfig:
properties:
chunkOverlap:
Expand Down
Loading