diff --git a/README.md b/README.md index 8e62441..0e10221 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ Apenas **Docker** e **Docker Compose** instalados na máquina. Nenhuma instalaç ## Instalação e início rápido ```bash -git clone +git clone https://github.com/ckoliveiraa/DataForge.git cd Dataforge # Sobe o frontend (interface visual) em segundo plano diff --git a/pyproject.toml b/pyproject.toml index 91997c0..44a3ccd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dataforge" -version = "0.1.0" +version = "0.1.7" description = "Synthetic relational dataset generator for data engineering studies" authors = [ {name = "Carlos Oliveira", email = "papodedados@gmail.com"} diff --git a/src/dataforge/core/registry.py b/src/dataforge/core/registry.py index d0e6e42..cb0947f 100644 --- a/src/dataforge/core/registry.py +++ b/src/dataforge/core/registry.py @@ -1,10 +1,25 @@ from __future__ import annotations +import re from datetime import date as _date # Dtypes that support min/max ranges RANGE_SUPPORTED_DTYPES = {"int", "float", "date"} + +def _normalize_date_value(value: str) -> str: + """Normaliza sufixos de data relativos para o formato aceito pelo Faker. + + Converte 'm' minúsculo para 'M' (meses) e garante que os demais + sufixos (d, w, y) fiquem em minúsculo, independente do que o usuário digitar. + """ + return re.sub( + r"([+-]?\d+)([a-zA-Z])", + lambda m: m.group(1) + ("M" if m.group(2).lower() == "m" else m.group(2).lower()), + value, + ) + + FAKER_REGISTRY: dict[str, callable] = { "uuid": lambda f, n, **kw: [f.uuid4() for _ in range(n)], "int_seq": lambda f, n, seq_start=1, **kw: list(range(seq_start, seq_start + n)), @@ -19,10 +34,10 @@ "bool": lambda f, n, **kw: [f.boolean() for _ in range(n)], "date": lambda f, n, min_value="-3y", max_value="today", **kw: [ f.date_between( - start_date=min_value + start_date=_normalize_date_value(min_value) if isinstance(min_value, str) else _date.fromisoformat(str(min_value)), - end_date=max_value + end_date=_normalize_date_value(max_value) if isinstance(max_value, str) else _date.fromisoformat(str(max_value)), ).isoformat() diff --git a/src/dataforge/frontend/src/App.tsx b/src/dataforge/frontend/src/App.tsx index 6d52033..948a43a 100644 --- a/src/dataforge/frontend/src/App.tsx +++ b/src/dataforge/frontend/src/App.tsx @@ -147,6 +147,7 @@ const VALID_DTYPES = [ const nodeTypes = { tableNode: TableNode }; + export default function App() { const [domain, setDomain] = useState("custom"); const [tables, setTables] = useState([]); @@ -613,6 +614,7 @@ export default function App() { }; const [showRunPanel, setShowRunPanel] = useState(false); + const [showRunHelp, setShowRunHelp] = useState(false); const [runConfig, setRunConfig] = useState<{ formats: string[], destination: 'local' | 'cloud' | 'database', @@ -1167,19 +1169,73 @@ export default function App() { )} + {/* Run Generator Help Modal */} + {showRunHelp && ( +
setShowRunHelp(false)}> +
e.stopPropagation()}> +
+

Run Generator — Field Reference

+ +
+ {([ + { section: 'Format', fields: [ + { name: 'Format (CSV / JSON / Parquet / Avro)', desc: 'Output file format. You can select multiple at once — the generator will write one file per format for each table.' }, + { name: 'JSON Mode', desc: 'Flat (NDJSON): one JSON object per line, ideal for large files and streaming pipelines.\nNested: a single JSON array — easier to read but heavier in memory.' }, + { name: 'Rows Override', desc: 'Override the number of rows defined in the schema for all tables. Leave empty to use the schema defaults.' }, + ]}, + { section: 'Destination', fields: [ + { name: 'Local', desc: 'Save generated files to a folder on this machine. Click the 📁 button to browse and select the folder path.' }, + { name: 'Cloud', desc: 'Upload generated files directly to a cloud bucket (GCS, S3, or Azure Blob Storage). Credentials are auto-loaded from the credentials/ folder in the project root.' }, + { name: 'Bucket / Container', desc: 'Name of the target cloud bucket or container where files will be uploaded.' }, + { name: 'Prefix', desc: 'Remote path prefix inside the bucket. Example: datasets/ → files land at datasets/schema_name/table_name/file.csv.' }, + { name: 'Database', desc: 'Load generated data directly into a database table. Supports PostgreSQL, MySQL and SQLite.' }, + { name: 'Database Type', desc: 'Choose the database engine. The connection form adapts to the selected type.' }, + { name: 'File Path (SQLite)', desc: 'Path to the SQLite .db file. It will be created if it does not exist.' }, + { name: 'Host / Port', desc: 'Address and port of the database server. Defaults are pre-filled per engine (PostgreSQL: 5432, MySQL: 3306).' }, + { name: 'Database', desc: 'Name of the database to connect to.' }, + { name: 'User / Password', desc: 'Credentials for the database connection.' }, + { name: 'If Table Exists', desc: 'Replace: drops and recreates the table.\nAppend: inserts rows without deleting existing data.\nFail: aborts if the table already exists.' }, + { name: 'DB Schema', desc: 'Optional database schema namespace (e.g. public in PostgreSQL). Leave empty to use the default.' }, + ]}, + { section: 'Reproducibility', fields: [ + { name: 'Random Seed', desc: 'Fixed integer seed for the random generator. Using the same seed always produces identical data — useful for testing and reproducible demos.' }, + ]}, + { section: 'Recurrence', fields: [ + { name: 'Interval (seconds)', desc: 'When set, the generator runs continuously, producing a new batch of data every N seconds. Press Stop to end the loop.' }, + { name: 'Batch Limit', desc: 'Maximum number of batches to run. Set to 0 for infinite recurrence (stop manually with the Stop button).' }, + { name: 'Column Increments', desc: 'Shifts a column\'s values forward by a fixed step on each batch — useful to simulate time-series or growing IDs.\nExample: orders › created_at › step 1 › days → each batch adds 1 day to all dates.' }, + ]}, + ] as { section: string; fields: { name: string; desc: string }[] }[]).map(({ section, fields }) => ( +
+

{section}

+ {fields.map(({ name, desc }) => ( +
+

{name}

+

{desc}

+
+ ))} +
+ ))} +
+
+ )} + {/* Run Generator Modal */} {showRunPanel && (

Run Generator

- +
+ + +
{/* Formats + JSON mode */} -
+ {runConfig.destination !== 'database' &&

Format

{(['csv', 'json', 'parquet', 'avro'] as const).map(fmt => { @@ -1213,7 +1269,7 @@ export default function App() { setRunConfig(r => ({...r, rows: e.target.value}))} style={{ width: '100%', padding: '0.5rem' }} placeholder="e.g. 5000" min="1" />
-
+
} {/* Destination */}
@@ -1239,7 +1295,29 @@ export default function App() { {runConfig.destination === 'local' && (
- setRunConfig(r => ({...r, outputDir: e.target.value}))} style={{ width: '100%', padding: '0.5rem' }} placeholder="e.g. output" /> +
+ setRunConfig(r => ({...r, outputDir: e.target.value}))} style={{ flex: 1, padding: '0.5rem' }} placeholder="e.g. output" /> + +
)} @@ -1323,13 +1401,7 @@ export default function App() { Save connection )} - +{/* Advanced connection string toggle disabled */}
{!dbAdvanced ? ( @@ -1607,16 +1679,7 @@ export default function App() {
)} -
- -