Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ Apenas **Docker** e **Docker Compose** instalados na máquina. Nenhuma instalaç
## Instalação e início rápido

```bash
git clone <repo-url>
git clone https://github.com/ckoliveiraa/DataForge.git
cd DataForge

# Sobe o frontend (interface visual) em segundo plano
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dataforge"
version = "0.1.0"
version = "0.1.7"
description = "Synthetic relational dataset generator for data engineering studies"
authors = [
{name = "Carlos Oliveira", email = "papodedados@gmail.com"}
Expand Down
19 changes: 17 additions & 2 deletions src/dataforge/core/registry.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,25 @@
from __future__ import annotations

import re
from datetime import date as _date

# Column dtypes that support min/max range bounds.
RANGE_SUPPORTED_DTYPES = {"int", "float", "date"}


def _normalize_date_value(value: str) -> str:
"""Normaliza sufixos de data relativos para o formato aceito pelo Faker.

Converte 'm' minúsculo para 'M' (meses) e garante que os demais
sufixos (d, w, y) fiquem em minúsculo, independente do que o usuário digitar.
"""
return re.sub(
r"([+-]?\d+)([a-zA-Z])",
lambda m: m.group(1) + ("M" if m.group(2).lower() == "m" else m.group(2).lower()),
value,
)


FAKER_REGISTRY: dict[str, callable] = {
"uuid": lambda f, n, **kw: [f.uuid4() for _ in range(n)],
"int_seq": lambda f, n, seq_start=1, **kw: list(range(seq_start, seq_start + n)),
Expand All @@ -19,10 +34,10 @@
"bool": lambda f, n, **kw: [f.boolean() for _ in range(n)],
"date": lambda f, n, min_value="-3y", max_value="today", **kw: [
f.date_between(
start_date=min_value
start_date=_normalize_date_value(min_value)
if isinstance(min_value, str)
else _date.fromisoformat(str(min_value)),
end_date=max_value
end_date=_normalize_date_value(max_value)
if isinstance(max_value, str)
else _date.fromisoformat(str(max_value)),
).isoformat()
Expand Down
105 changes: 84 additions & 21 deletions src/dataforge/frontend/src/App.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,7 @@ const VALID_DTYPES = [

const nodeTypes = { tableNode: TableNode };


export default function App() {
const [domain, setDomain] = useState("custom");
const [tables, setTables] = useState<Table[]>([]);
Expand Down Expand Up @@ -613,6 +614,7 @@ export default function App() {
};

const [showRunPanel, setShowRunPanel] = useState(false);
const [showRunHelp, setShowRunHelp] = useState(false);
const [runConfig, setRunConfig] = useState<{
formats: string[],
destination: 'local' | 'cloud' | 'database',
Expand Down Expand Up @@ -1167,19 +1169,73 @@ export default function App() {
)}
</div>

{/* Run Generator Help Modal */}
{showRunHelp && (
<div style={{ position: 'fixed', top: 0, left: 0, right: 0, bottom: 0, background: 'rgba(0,0,0,0.8)', backdropFilter: 'blur(6px)', zIndex: 200, display: 'flex', alignItems: 'center', justifyContent: 'center' }} onClick={() => setShowRunHelp(false)}>
<div style={{ width: '560px', maxWidth: '95vw', maxHeight: '85vh', overflowY: 'auto', background: 'rgba(9,12,20,0.98)', border: '1px solid rgba(255,255,255,0.1)', borderRadius: '12px', padding: '1.5rem' }} onClick={e => e.stopPropagation()}>
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '1.25rem' }}>
<h3 style={{ margin: 0, color: '#e2e8f0', fontSize: '1rem' }}>Run Generator — Field Reference</h3>
<button onClick={() => setShowRunHelp(false)} style={{ background: 'none', border: 'none', color: '#64748b', cursor: 'pointer', fontSize: '1.1rem', lineHeight: 1 }}>✕</button>
</div>
{([
{ section: 'Format', fields: [
{ name: 'Format (CSV / JSON / Parquet / Avro)', desc: 'Output file format. You can select multiple at once — the generator will write one file per format for each table.' },
{ name: 'JSON Mode', desc: 'Flat (NDJSON): one JSON object per line, ideal for large files and streaming pipelines.\nNested: a single JSON array — easier to read but heavier in memory.' },
{ name: 'Rows Override', desc: 'Override the number of rows defined in the schema for all tables. Leave empty to use the schema defaults.' },
]},
{ section: 'Destination', fields: [
{ name: 'Local', desc: 'Save generated files to a folder on this machine. Click the 📁 button to browse and select the folder path.' },
{ name: 'Cloud', desc: 'Upload generated files directly to a cloud bucket (GCS, S3, or Azure Blob Storage). Credentials are auto-loaded from the credentials/ folder in the project root.' },
{ name: 'Bucket / Container', desc: 'Name of the target cloud bucket or container where files will be uploaded.' },
{ name: 'Prefix', desc: 'Remote path prefix inside the bucket. Example: datasets/ → files land at datasets/schema_name/table_name/file.csv.' },
{ name: 'Database', desc: 'Load generated data directly into a database table. Supports PostgreSQL, MySQL and SQLite.' },
{ name: 'Database Type', desc: 'Choose the database engine. The connection form adapts to the selected type.' },
{ name: 'File Path (SQLite)', desc: 'Path to the SQLite .db file. It will be created if it does not exist.' },
{ name: 'Host / Port', desc: 'Address and port of the database server. Defaults are pre-filled per engine (PostgreSQL: 5432, MySQL: 3306).' },
{ name: 'Database', desc: 'Name of the database to connect to.' },
{ name: 'User / Password', desc: 'Credentials for the database connection.' },
{ name: 'If Table Exists', desc: 'Replace: drops and recreates the table.\nAppend: inserts rows without deleting existing data.\nFail: aborts if the table already exists.' },
{ name: 'DB Schema', desc: 'Optional database schema namespace (e.g. public in PostgreSQL). Leave empty to use the default.' },
]},
{ section: 'Reproducibility', fields: [
{ name: 'Random Seed', desc: 'Fixed integer seed for the random generator. Using the same seed always produces identical data — useful for testing and reproducible demos.' },
]},
{ section: 'Recurrence', fields: [
{ name: 'Interval (seconds)', desc: 'When set, the generator runs continuously, producing a new batch of data every N seconds. Press Stop to end the loop.' },
{ name: 'Batch Limit', desc: 'Maximum number of batches to run. Set to 0 for infinite recurrence (stop manually with the Stop button).' },
{ name: 'Column Increments', desc: 'Shifts a column\'s values forward by a fixed step on each batch — useful to simulate time-series or growing IDs.\nExample: orders › created_at › step 1 › days → each batch adds 1 day to all dates.' },
]},
] as { section: string; fields: { name: string; desc: string }[] }[]).map(({ section, fields }) => (
<div key={section} style={{ marginBottom: '1.25rem' }}>
<p style={{ margin: '0 0 0.6rem', fontSize: '0.7rem', textTransform: 'uppercase', letterSpacing: '0.08em', color: '#475569' }}>{section}</p>
{fields.map(({ name, desc }) => (
<div key={name} style={{ marginBottom: '0.75rem', paddingLeft: '0.75rem', borderLeft: '2px solid rgba(96,165,250,0.2)' }}>
<p style={{ margin: '0 0 0.2rem', fontSize: '0.82rem', color: '#93c5fd', fontWeight: 600 }}>{name}</p>
<p style={{ margin: 0, fontSize: '0.78rem', color: '#94a3b8', lineHeight: 1.5, whiteSpace: 'pre-line' }}>{desc}</p>
</div>
))}
</div>
))}
</div>
</div>
)}

{/* Run Generator Modal */}
{showRunPanel && (
<div style={{ position: 'fixed', top: 0, left: 0, right: 0, bottom: 0, background: 'rgba(0,0,0,0.72)', backdropFilter: 'blur(6px)', zIndex: 100, display: 'flex', alignItems: 'center', justifyContent: 'center' }}>
<div className="glass-panel animated-scale" style={{ width: '580px', maxWidth: '95vw', padding: '1.5rem', background: 'rgba(9, 12, 20, 0.97)', border: '1px solid rgba(255,255,255,0.08)', borderTopColor: 'rgba(255,255,255,0.12)' }}>
<div style={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', marginBottom: '1rem' }}>
<h2 style={{ margin: 0, fontFamily: 'var(--font-display)', fontSize: '1.2rem', display: 'flex', alignItems: 'center', gap: '0.5rem' }}><Play size={18} color="var(--success)"/> Run Generator</h2>
<button onClick={() => setShowRunPanel(false)} className="btn-icon" aria-label="Close run panel"><X size={18}/></button>
<div style={{ display: 'flex', gap: '0.5rem', alignItems: 'center' }}>
<button onClick={() => setShowRunHelp(true)} style={{ background: 'rgba(148,163,184,0.1)', border: '1px solid rgba(148,163,184,0.2)', borderRadius: '6px', color: '#94a3b8', cursor: 'pointer', fontSize: '0.78rem', padding: '0.25rem 0.6rem', display: 'flex', alignItems: 'center', gap: '0.3rem' }}>? Help</button>
<button onClick={() => setShowRunPanel(false)} className="btn-icon" aria-label="Close run panel"><X size={18}/></button>
</div>
</div>

<div style={{ display: 'flex', flexDirection: 'column', gap: '1rem', maxHeight: '70vh', overflowY: 'auto', paddingRight: '0.25rem' }}>

{/* Formats + JSON mode */}
<div style={{ borderBottom: '1px solid rgba(148,163,184,0.15)', paddingBottom: '1rem' }}>
{runConfig.destination !== 'database' && <div style={{ borderBottom: '1px solid rgba(148,163,184,0.15)', paddingBottom: '1rem' }}>
<p style={{ margin: '0 0 0.75rem', fontSize: '0.75rem', textTransform: 'uppercase', letterSpacing: '0.05em', color: '#64748b' }}>Format</p>
<div style={{ display: 'flex', gap: '0.5rem', flexWrap: 'wrap' }}>
{(['csv', 'json', 'parquet', 'avro'] as const).map(fmt => {
Expand Down Expand Up @@ -1213,7 +1269,7 @@ export default function App() {
<input type="number" value={runConfig.rows} onChange={e => setRunConfig(r => ({...r, rows: e.target.value}))} style={{ width: '100%', padding: '0.5rem' }} placeholder="e.g. 5000" min="1" />
</div>
</div>
</div>
</div>}

{/* Destination */}
<div style={{ borderBottom: '1px solid rgba(148,163,184,0.15)', paddingBottom: '1rem' }}>
Expand All @@ -1239,7 +1295,29 @@ export default function App() {
{runConfig.destination === 'local' && (
<div>
<label style={{ display: 'block', marginBottom: '0.5rem', color: '#cbd5e1', fontSize: '0.85rem' }}>Output Directory</label>
<input type="text" value={runConfig.outputDir} onChange={e => setRunConfig(r => ({...r, outputDir: e.target.value}))} style={{ width: '100%', padding: '0.5rem' }} placeholder="e.g. output" />
<div style={{ display: 'flex', gap: '0.5rem' }}>
<input type="text" value={runConfig.outputDir} onChange={e => setRunConfig(r => ({...r, outputDir: e.target.value}))} style={{ flex: 1, padding: '0.5rem' }} placeholder="e.g. output" />
<button
onClick={async (e) => {
const btn = e.currentTarget;
if (btn.disabled) return;
btn.disabled = true;
try {
const res = await fetch('/api/browse-folder');
const { path } = await res.json();
if (path) setRunConfig(r => ({ ...r, outputDir: path }));
} catch {
// ignore
} finally {
btn.disabled = false;
}
}}
title="Browse folder"
style={{ padding: '0.5rem 0.75rem', borderRadius: '6px', border: '1px solid rgba(255,255,255,0.15)', background: 'rgba(255,255,255,0.07)', color: '#94a3b8', cursor: 'pointer', fontSize: '1rem', whiteSpace: 'nowrap' }}
>
📁
</button>
</div>
</div>
)}

Expand Down Expand Up @@ -1323,13 +1401,7 @@ export default function App() {
Save connection
</button>
)}
<button
type="button"
onClick={() => { setDbAdvanced(v => !v); setDbTestStatus('idle'); setDbTestError(''); }}
style={{ background: 'none', border: 'none', color: '#64748b', fontSize: '0.75rem', cursor: 'pointer', textDecoration: 'underline', padding: 0 }}
>
{dbAdvanced ? 'Use form' : 'Advanced (connection string)'}
</button>
{/* Advanced connection string toggle disabled */}
</div>

{!dbAdvanced ? (
Expand Down Expand Up @@ -1607,16 +1679,7 @@ export default function App() {
</div>
</div>
)}
<div>
<label style={{ display: 'block', marginBottom: '0.5rem', color: '#cbd5e1', fontSize: '0.85rem' }}>Column Filters <span style={{ color: '#64748b' }}>(one per line: table:col1,col2)</span></label>
<textarea
value={runConfig.columnsFilter}
onChange={e => setRunConfig(r => ({ ...r, columnsFilter: e.target.value }))}
rows={3}
placeholder={"orders:id,status,total\ncustomers:id,email"}
style={{ width: '100%', padding: '0.5rem', background: 'rgba(255,255,255,0.04)', border: '1px solid rgba(255,255,255,0.1)', borderRadius: '6px', color: 'white', resize: 'vertical', fontSize: '0.82rem', fontFamily: 'monospace' }}
/>
</div>
{/* Column Filters disabled */}
</div>

{/* Terminal output — always visible, above the run button */}
Expand Down
68 changes: 67 additions & 1 deletion src/dataforge/frontend/vite.config.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import { spawn } from 'child_process'
import { spawn, execSync } from 'child_process'
import { writeFileSync, readFileSync, mkdirSync, existsSync, readdirSync, rmSync } from 'fs'
import { resolve, join } from 'path'
import { tmpdir } from 'os'
Expand All @@ -10,11 +10,49 @@ const pyprojectRaw = readFileSync(resolve(__dirname, '../../../pyproject.toml'),
const appVersion = pyprojectRaw.match(/^version\s*=\s*"([^"]+)"/m)?.[1] ?? '0.0.0'

let activeProcess: ReturnType<typeof spawn> | null = null
let browseInProgress = false

const cliRunnerPlugin = () => ({
name: 'cli-runner',
configureServer(server: any) {
server.middlewares.use(async (req: any, res: any, next: any) => {
if (req.url === '/api/browse-folder' && req.method === 'GET') {
if (browseInProgress) {
res.setHeader('Content-Type', 'application/json')
res.end(JSON.stringify({ path: '' }))
return
}
browseInProgress = true
try {
const ps1Path = join(tmpdir(), `df_browse_${Date.now()}.ps1`)
const script = [
'Add-Type -AssemblyName System.Windows.Forms',
'$owner = New-Object System.Windows.Forms.Form',
'$owner.TopMost = $true',
'$owner.StartPosition = "CenterScreen"',
'$owner.Size = New-Object System.Drawing.Size(1,1)',
'$owner.Show()',
'$owner.Activate()',
'$d = New-Object System.Windows.Forms.FolderBrowserDialog',
'$d.ShowNewFolderButton = $true',
'$r = $d.ShowDialog($owner)',
'$owner.Dispose()',
'if ($r -eq [System.Windows.Forms.DialogResult]::OK) { Write-Output $d.SelectedPath }',
].join('\n')
writeFileSync(ps1Path, script, 'utf-8')
const result = execSync(`powershell -NoProfile -ExecutionPolicy Bypass -File "${ps1Path}"`, { encoding: 'utf-8', timeout: 60000 }).trim()
try { rmSync(ps1Path) } catch {}
browseInProgress = false
res.setHeader('Content-Type', 'application/json')
res.end(JSON.stringify({ path: result }))
} catch (e: any) {
browseInProgress = false
res.setHeader('Content-Type', 'application/json')
res.end(JSON.stringify({ path: '', error: e.message }))
}
return
}

if (req.url === '/api/stop-cli' && req.method === 'POST') {
if (activeProcess) {
activeProcess.kill('SIGTERM')
Expand Down Expand Up @@ -418,6 +456,34 @@ tables:
DTYPES: int_seq, uuid, int, float, str, bool, date, email, name, phone, address, city, country, company, text, url, currency, iban
FAKER: name, first_name, last_name, email, phone_number, address, city, postcode, country, company, job, url, user_name, uuid4, date, past_date, future_date, iban, currency_code, pricetag, text, latitude, longitude, ipv4, credit_card_number

RULES (all mandatory):

1. str columns MUST have choices or faker_provider — never plain str alone:
BAD: status: {dtype: str}
GOOD: status: {dtype: str, choices: [active, inactive]}
GOOD: category: {dtype: str, faker_provider: job}

2. Every table MUST have exactly one primary_key column (dtype: int_seq or uuid).
BAD: two columns with primary_key: true in the same table
BAD: a table with no primary_key at all

3. foreign_key MUST reference the primary_key column of the target table:
BAD: foreign_key: {table: orders, column: status}
GOOD: foreign_key: {table: orders, column: id}

4. min/max are only valid for dtype int, float, or date — never for bool, str, email, name, phone, etc.
BAD: email: {dtype: email, min: 1}
GOOD: age: {dtype: int, min: 18, max: 99}

5. nullable must be a float between 0.0 and 1.0:
BAD: nullable: true
BAD: nullable: 50
GOOD: nullable: 0.2

6. domain must be snake_case (lowercase letters, digits, underscores — no spaces, no hyphens):
BAD: domain: My Domain
GOOD: domain: my_domain

EXAMPLE:
domain: shop
tables:
Expand Down
Loading