Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""SemanticInsightsGenerator - SQL-based feature generation using LLMs."""

from agentune.analyze.feature.gen.semantic_insights_generator.generator import (
SemanticInsightsGenerator,
)

__all__ = ['SemanticInsightsGenerator']
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
"""Basic feature generator - executes one generation cycle."""

from __future__ import annotations

from collections.abc import AsyncIterator

from attrs import define, field
from duckdb import DuckDBPyConnection

from agentune.analyze.feature.gen.base import GeneratedFeature
from agentune.analyze.feature.gen.semantic_insights_generator.corrector import (
LLMSqlFeatureCorrector,
)
from agentune.analyze.feature.gen.semantic_insights_generator.llm.schema import (
GeneratedFeatureSpec,
)
from agentune.analyze.feature.problem import Problem
from agentune.analyze.feature.sql.create import feature_from_query
from agentune.analyze.feature.sql.validator_loop import ValidateAndRetryParams, validate_and_retry
from agentune.analyze.feature.validate.base import FeatureValidator
from agentune.analyze.join.base import TablesWithJoinStrategies
from agentune.core.dataset import Dataset
from agentune.core.sercontext import LLMWithSpec

# Default validation retry budgets
_DEFAULT_MAX_GLOBAL_RETRIES = 5
_DEFAULT_MAX_LOCAL_RETRIES = 3


@define
class BasicFeatureGenerator:
    """Executes one complete generation cycle for SQL-based features.

    This component handles:
    - Calling LLM to generate feature specifications
    - Validating each feature using validate_and_retry()
    - Yielding validated features as they're ready (streaming)
    """

    generation_model: LLMWithSpec  # reasoning model that produces feature SQL specs
    repair_model: LLMWithSpec  # fast model used by the corrector to repair failing SQL
    seed: int | None = None  # reproducibility seed (None = non-deterministic)
    validators: tuple[FeatureValidator, ...] = field(factory=tuple)
    max_global_retries: int = _DEFAULT_MAX_GLOBAL_RETRIES
    max_local_retries: int = _DEFAULT_MAX_LOCAL_RETRIES

    async def agenerate(
        self,
        dataset: Dataset,
        problem: Problem,  # noqa: ARG002 - Used in TODO: LLM prompt generation
        join_strategies: TablesWithJoinStrategies,
        conn: DuckDBPyConnection,
    ) -> AsyncIterator[GeneratedFeature]:
        """Generate features using a single LLM call cycle.

        Args:
            dataset: Input dataset for feature generation context
            problem: Problem specification (target column, problem type)
            join_strategies: Available join strategies for secondary tables
            conn: DuckDB connection for SQL execution

        Yields:
            GeneratedFeature instances as they pass validation
        """
        # TODO: Steps 1-3 - LLM generation (implement later)
        # 1. Sample data for LLM context using self.seed
        # 2. Build prompt with problem description, schema, samples (needs `problem`)
        # 3. Call generation_model to get feature specifications

        # Step 4: Validate each feature using validate_and_retry
        feature_specs: list[GeneratedFeatureSpec] = []  # Placeholder empty list

        # Extract secondary tables from join_strategies
        secondary_tables = [tws.table for tws in join_strategies]

        # TODO: Sample data for validation (use self.seed)
        # For now, use full dataset
        sampled_data = dataset

        # The corrector depends only on the repair model, so a single instance
        # serves every spec in this cycle (previously rebuilt per iteration).
        corrector = LLMSqlFeatureCorrector(
            repair_model=self.repair_model,
        )

        # Validate and yield features as they pass, preserving streaming behavior.
        for spec in feature_specs:
            # Validate feature with retry loop; returns None when the feature
            # cannot be repaired within the retry budgets.
            feature = await validate_and_retry(
                ValidateAndRetryParams(
                    feature_ctor=feature_from_query,
                    conn=conn,
                    sql_query=spec.sql_query,
                    params=dataset.schema,
                    secondary_tables=secondary_tables,
                    input=sampled_data,
                    max_global_retries=self.max_global_retries,
                    max_local_retries=self.max_local_retries,
                    corrector=corrector,
                    validators=self.validators,
                    name=spec.name,
                    description=spec.description,
                    technical_description=spec.sql_query,
                )
            )

            # Explicit None check: don't rely on the truthiness of a Feature object.
            if feature is not None:
                yield GeneratedFeature(feature=feature, has_good_defaults=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""LLM-based SQL query corrector for validation errors."""

from __future__ import annotations

from attrs import define

from agentune.analyze.feature.sql.base import SqlFeatureCorrector
from agentune.analyze.feature.validate.base import FeatureValidationError
from agentune.core.sercontext import LLMWithSpec


@define
class LLMSqlFeatureCorrector(SqlFeatureCorrector):
    """LLM-backed corrector that repairs SQL queries which failed validation.

    Plugged into the validate_and_retry() loop so broken features get an
    automatic repair attempt before being discarded.
    """

    repair_model: LLMWithSpec  # fast model used to propose corrected SQL

    async def correct(
        self,
        sql_query: str,  # noqa: ARG002 - consumed once LLM correction lands
        error: FeatureValidationError,  # noqa: ARG002 - consumed once LLM correction lands
    ) -> str | None:
        """Try to produce a fixed version of a failing SQL query.

        Args:
            sql_query: The SQL query that failed validation
            error: The validation error with code and message

        Returns:
            A corrected SQL query string, or None to give up
        """
        # TODO: Implement LLM-based correction using self.repair_model.
        # Until then, always give up so the retry loop terminates promptly.
        return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""SemanticInsightsGenerator - SQL-based feature generation using LLMs."""

from __future__ import annotations

from collections.abc import AsyncIterator

from attrs import frozen
from duckdb import DuckDBPyConnection

from agentune.analyze.feature.gen.base import FeatureGenerator, GeneratedFeature
from agentune.analyze.feature.gen.semantic_insights_generator.basic_generator import (
BasicFeatureGenerator,
)
from agentune.analyze.feature.problem import Problem
from agentune.analyze.feature.validate.law_and_order import LawAndOrderValidator
from agentune.analyze.join.base import TablesWithJoinStrategies
from agentune.core.dataset import Dataset
from agentune.core.sercontext import LLMWithSpec


@frozen
class SemanticInsightsGenerator(FeatureGenerator):
    """Feature generator that creates SQL-based features using LLMs.

    This generator:
    - Uses a reasoning model to generate SQL feature specifications
    - Validates features using LawAndOrderValidator
    - Repairs failed features using a fast repair model
    - Integrates with the official validation framework

    Attributes:
        generation_model: LLM for generating feature SQL (e.g., claude-opus-4-5)
        repair_model: LLM for repairing validation errors (e.g., claude-haiku-4)
        seed: Random seed for reproducible generation (None = non-deterministic)
    """

    generation_model: LLMWithSpec
    repair_model: LLMWithSpec
    seed: int | None = None

    async def agenerate(
        self,
        feature_search: Dataset,
        problem: Problem,
        join_strategies: TablesWithJoinStrategies,
        conn: DuckDBPyConnection,
    ) -> AsyncIterator[GeneratedFeature]:
        """Generate SQL-based features for the given problem.

        Delegates the actual cycle to BasicFeatureGenerator, wiring in the
        LawAndOrderValidator as the single validator.

        Args:
            feature_search: Input dataset for feature generation context
            problem: Problem specification (target column, problem type)
            join_strategies: Available join strategies for secondary tables
            conn: DuckDB connection for SQL execution

        Yields:
            GeneratedFeature instances with has_good_defaults=False
        """
        # A fresh delegate per call keeps this frozen class stateless.
        delegate = BasicFeatureGenerator(
            generation_model=self.generation_model,
            repair_model=self.repair_model,
            seed=self.seed,
            validators=(LawAndOrderValidator(),),
        )

        # Stream results straight through from the delegate.
        async for generated in delegate.agenerate(
            dataset=feature_search,
            problem=problem,
            join_strategies=join_strategies,
            conn=conn,
        ):
            yield generated
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""LLM interface for SemanticInsightsGenerator."""

from agentune.analyze.feature.gen.semantic_insights_generator.llm.schema import (
GeneratedFeatureSpec,
)

__all__ = ['GeneratedFeatureSpec']
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""LLM interface schema definitions."""

from attrs import frozen


@frozen
class GeneratedFeatureSpec:
    """An LLM-produced feature definition expressed as a SQL query.

    The generation model emits these specs; downstream code validates the SQL
    and converts each spec that passes into a Feature object.

    Attributes:
        name: Identifier for the feature
        description: Human-readable explanation of what the feature computes
        sql_query: Complete SQL SELECT statement implementing the feature
    """

    name: str
    description: str
    sql_query: str
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for InsightfulTextGenerator (ConversationQueryFeatureGenerator)."""
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for SemanticInsightsGenerator."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""Fixtures for semantic insights generator tests.

TODO: Temporary placeholder. Real test data will be added in a separate task.
Should use tabular (non-conversational) data, not the conversational data
used by insightful_text_generator.
"""

import pytest
from duckdb import DuckDBPyConnection

from agentune.analyze.feature.problem import ClassificationProblem
from agentune.analyze.join.base import TablesWithJoinStrategies
from agentune.core.dataset import Dataset
from agentune.core.llm import LLMContext
from agentune.core.sercontext import LLMWithSpec


@pytest.fixture
def test_dataset_with_strategy(conn: DuckDBPyConnection) -> tuple[Dataset, str, TablesWithJoinStrategies]:
    """Placeholder fixture; real tabular test data lands in a follow-up task."""
    raise NotImplementedError('TODO: Define test data for semantic insights generator')


@pytest.fixture
async def real_llm_with_spec(llm_context_nocache: LLMContext) -> LLMWithSpec:
    """Placeholder fixture; a real LLM spec lands in a follow-up task."""
    raise NotImplementedError('TODO: Define LLM fixture for semantic insights generator')


@pytest.fixture
def problem(test_dataset_with_strategy: tuple[Dataset, str, TablesWithJoinStrategies]) -> ClassificationProblem:
    """Placeholder fixture; a real problem definition lands in a follow-up task."""
    raise NotImplementedError('TODO: Define problem fixture for semantic insights generator')
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
"""E2E tests for SemanticInsightsGenerator."""

import pytest
from duckdb import DuckDBPyConnection

from agentune.analyze.feature.gen.semantic_insights_generator import SemanticInsightsGenerator
from agentune.analyze.feature.problem import ClassificationProblem
from agentune.analyze.join.base import TablesWithJoinStrategies
from agentune.core.dataset import Dataset
from agentune.core.sercontext import LLMWithSpec


@pytest.mark.integration
async def test_semantic_insights_generator(
    test_dataset_with_strategy: tuple[Dataset, str, TablesWithJoinStrategies],
    conn: DuckDBPyConnection,
    real_llm_with_spec: LLMWithSpec,
    problem: ClassificationProblem,
) -> None:
    """Test that SemanticInsightsGenerator can be instantiated and API works.

    This is a minimal test validating the generator structure. It expects
    empty results until LLM generation is implemented in BasicFeatureGenerator.

    NOTE: When LLM generation is implemented in BasicFeatureGenerator:
    1. Remove the "assert len(features) == 0" check
    2. Add assertions for generated features (structure, computation, validation)
    3. Consider adding more comprehensive tests following test_e2e.py patterns
    """
    main_dataset, target_col, strategies = test_dataset_with_strategy

    # Same LLM serves both generation and repair roles for this smoke test.
    generator = SemanticInsightsGenerator(
        generation_model=real_llm_with_spec,
        repair_model=real_llm_with_spec,
        seed=42,
    )

    # Drain the async generator; the stub implementation yields nothing yet.
    collected = [
        item
        async for item in generator.agenerate(
            feature_search=main_dataset,
            problem=problem,
            join_strategies=strategies,
            conn=conn,
        )
    ]

    assert len(collected) == 0, (
        'Expected no features from stub implementation. '
        'If this fails, LLM generation has been implemented - update test!'
    )