From 4d68ae08a71bd75ea7b26008d46804e3ffdd7c2a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 03:39:46 +0000 Subject: [PATCH 1/2] Initial plan From bb66b01b0573bd0ef169076fac72ab74b5f3f8e5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 11 Mar 2026 03:48:09 +0000 Subject: [PATCH 2/2] Fix generate_batch for >=200 rows: auto sub-batch and fix empty-return bug Co-authored-by: Icar0S <39846852+Icar0S@users.noreply.github.com> --- src/synthetic/generator.py | 54 +++++++- tests/backend/api/test_synthetic_backend.py | 133 +++++++++++++++++++- 2 files changed, 185 insertions(+), 2 deletions(-) diff --git a/src/synthetic/generator.py b/src/synthetic/generator.py index 1df3d15..f2de09c 100644 --- a/src/synthetic/generator.py +++ b/src/synthetic/generator.py @@ -21,6 +21,13 @@ from llm_client import create_llm_client +# Maximum rows to request per individual LLM call. +# Requests above this threshold are automatically split into sub-batches. +# Production testing shows ≤100 rows generates reliably; larger counts cause +# the model to return fewer rows than requested (typically only ~70-75%). +_LLM_MAX_ROWS_PER_CALL = 100 + + class SyntheticDataGenerator: """Generates synthetic data using LLM.""" @@ -344,10 +351,37 @@ def generate_batch( logs.append("ERROR: No LLM configured, using mock data") return self._generate_mock_data(schema, num_rows), logs + # For large requests, split into smaller LLM calls for reliability. + # Production testing shows ≤100 rows generates reliably; larger counts + # cause the model to return only ~70-75% of requested rows, wasting + # API quota on retries.
+ if num_rows > _LLM_MAX_ROWS_PER_CALL: + all_records = [] + offset = 0 + while len(all_records) < num_rows: + remaining = num_rows - len(all_records) + sub_size = min(_LLM_MAX_ROWS_PER_CALL, remaining) + sub_seed = seed + offset if seed is not None else None + sub_records, sub_logs = self.generate_batch( + schema=schema, + num_rows=sub_size, + locale=locale, + seed=sub_seed, + max_retries=max_retries, + ) + logs.extend(sub_logs) + all_records.extend(sub_records) + offset += sub_size + return all_records[:num_rows], logs + # Build prompt prompt = self._build_prompt(schema, num_rows, locale, seed) logs.append(f"Generated prompt for {num_rows} rows") + # Track the best partial result across attempts so we can fall back + # gracefully when all retries are exhausted. + best_rows = [] + # Try to generate data with retries for attempt in range(max_retries): try: @@ -367,6 +401,10 @@ def generate_batch( rows = self._parse_csv_response(response_text, num_columns) logs.append(f"Parsed {len(rows)} rows from CSV") + # Keep the best partial result in case all retries fail + if len(rows) > len(best_rows): + best_rows = rows + if len(rows) < num_rows * 0.8: # At least 80% of requested rows logs.append(f"WARNING: Only got {len(rows)}/{num_rows} rows, retrying...") continue @@ -388,7 +426,21 @@ def generate_batch( return self._generate_mock_data(schema, num_rows), logs time.sleep(1) # Brief delay before retry - return [], logs + # All retries exhausted via the insufficient-rows path (no exception). + # Use the best partial LLM result and fill any gap with mock data so + # callers always receive the requested number of rows.
+ logs.append( + f"Max retries reached with only {len(best_rows)}/{num_rows} rows, " + "filling remainder with mock data" + ) + if best_rows: + records = self._coerce_types(best_rows, schema) + records = self._enforce_uniqueness(records, schema) + if len(records) < num_rows: + mock_fill = self._generate_mock_data(schema, num_rows - len(records)) + records.extend(mock_fill) + return records[:num_rows], logs + return self._generate_mock_data(schema, num_rows), logs def _generate_random_date( self, start_str: str, end_str: str, include_time: bool = False diff --git a/tests/backend/api/test_synthetic_backend.py b/tests/backend/api/test_synthetic_backend.py index 59ba082..436d83c 100644 --- a/tests/backend/api/test_synthetic_backend.py +++ b/tests/backend/api/test_synthetic_backend.py @@ -2,12 +2,13 @@ import re import pytest +from unittest.mock import MagicMock, patch from src.synthetic.validators import ( validate_schema, validate_generate_request, validate_preview_request, ) -from src.synthetic.generator import SyntheticDataGenerator +from src.synthetic.generator import SyntheticDataGenerator, _LLM_MAX_ROWS_PER_CALL class TestValidators: @@ -312,5 +313,135 @@ def test_generate_mock_data_with_swapped_dates(self): assert datetime_pattern.match(record["created_at"]), f"Invalid datetime format: {record['created_at']}" +class TestGenerateBatchLargeRowHandling: + """Tests for generate_batch behaviour with large row counts and LLM failures.""" + + SCHEMA = { + "columns": [ + {"name": "id", "type": "integer", "options": {"min": 1, "max": 1000000}}, + {"name": "name", "type": "string", "options": {}}, + ] + } + + def _make_generator_with_mock_llm(self, llm_side_effect=None, llm_return_value=None): + """Return a SyntheticDataGenerator with a mocked LLM client.""" + generator = SyntheticDataGenerator.__new__(SyntheticDataGenerator) + generator.api_key = "fake-key" + generator.model = "fake-model" + generator._llm_available = True + mock_client = MagicMock() + if llm_side_effect is not None: + mock_client.generate.side_effect = llm_side_effect + elif llm_return_value is not None: + mock_client.generate.return_value = llm_return_value + generator.llm_client = mock_client + return generator + + def _csv_for_n_rows(self, n, start=1): + """Generate a valid CSV string for n rows (id, name).""" + lines = [f"{start + i},name_{start + i}" for i in range(n)] + return "\n".join(lines) + + # ------------------------------------------------------------------ + # Sub-batching tests + # ------------------------------------------------------------------ + + def test_llm_max_rows_per_call_constant_is_100(self): + """_LLM_MAX_ROWS_PER_CALL should be 100 so ≤100-row requests stay + in a single LLM call while larger requests are split.""" + assert _LLM_MAX_ROWS_PER_CALL == 100 + + def test_large_request_is_split_into_sub_batches(self): + """Requesting >100 rows must trigger sub-batching; each sub-call + should receive at most _LLM_MAX_ROWS_PER_CALL rows.""" + call_sizes = [] + + def capture_prompt(messages, **kwargs): + content = messages[0]["content"] + # Extract row count from "Generate X rows…" + import re as _re + m = _re.search(r"Generate (\d+) rows", content) + if m: + call_sizes.append(int(m.group(1))) + # Return enough rows for the requested count + n = int(m.group(1)) if m else 1 + return self._csv_for_n_rows(n) + + generator = self._make_generator_with_mock_llm(llm_side_effect=capture_prompt) + records, logs = generator.generate_batch(self.SCHEMA, num_rows=200) + + assert len(records) == 200 + # Each individual LLM call must be ≤ _LLM_MAX_ROWS_PER_CALL + assert all(n <= _LLM_MAX_ROWS_PER_CALL for n in call_sizes), ( + f"Some LLM calls requested more than {_LLM_MAX_ROWS_PER_CALL} rows: {call_sizes}" + ) + + def test_exact_threshold_uses_single_call(self): + """Requesting exactly _LLM_MAX_ROWS_PER_CALL rows must NOT trigger + sub-batching (one LLM call).""" + call_count = [0] + + def counting_side_effect(messages, **kwargs): + call_count[0] += 1 + return self._csv_for_n_rows(_LLM_MAX_ROWS_PER_CALL) + + generator = self._make_generator_with_mock_llm(llm_side_effect=counting_side_effect) + records, _ = generator.generate_batch(self.SCHEMA, num_rows=_LLM_MAX_ROWS_PER_CALL) + + assert call_count[0] == 1 + assert len(records) == _LLM_MAX_ROWS_PER_CALL + + # ------------------------------------------------------------------ + # Bug-fix: return [], logs → fallback when all retries return too few rows + # ------------------------------------------------------------------ + + def test_insufficient_rows_all_retries_falls_back_to_mock_data(self): + """When all retries return fewer than 80% of requested rows (no + exception), generate_batch must return the requested count via + mock-data fill instead of an empty list.""" + # LLM always returns only 70 rows when 100 are requested + generator = self._make_generator_with_mock_llm( + llm_return_value=self._csv_for_n_rows(70) + ) + records, logs = generator.generate_batch(self.SCHEMA, num_rows=100, max_retries=2) + + assert len(records) == 100, ( + "Expected 100 records but got an empty list – the return [], logs bug may have reappeared" + ) + assert any("filling remainder with mock data" in log for log in logs), ( + "Expected a log message about filling with mock data" + ) + + def test_insufficient_rows_result_is_never_empty(self): + """generate_batch must never return an empty list regardless of how + many retries fail the 80% threshold.""" + generator = self._make_generator_with_mock_llm( + llm_return_value=self._csv_for_n_rows(10) # 10/100 = 10%, well below 80% + ) + records, logs = generator.generate_batch(self.SCHEMA, num_rows=100, max_retries=3) + + assert len(records) > 0, "generate_batch must never return an empty list" + assert len(records) == 100 + + def test_best_partial_rows_are_kept_when_retries_exhausted(self): + """The best partial LLM result should be used (not discarded) when + retries are exhausted via the insufficient-rows path.""" + attempt_num = [0] + def improving_side_effect(messages, **kwargs): + attempt_num[0] += 1 + # Return progressively more rows but always < 80 (threshold for 100) + rows = 50 + attempt_num[0] * 5 # 55, 60, 65 – all < 80 + return self._csv_for_n_rows(rows) + + generator = self._make_generator_with_mock_llm(llm_side_effect=improving_side_effect) + records, logs = generator.generate_batch(self.SCHEMA, num_rows=100, max_retries=3) + + # Should have 100 rows (best partial 65 + 35 mock fill) + assert len(records) == 100 + # The mock-fill log should be present + assert any("filling remainder with mock data" in log for log in logs) + + if __name__ == "__main__": pytest.main([__file__, "-v"])