diff --git a/ai_testbench/bin/test_time_extraction.dart b/ai_testbench/bin/test_time_extraction.dart index b2a08f2..6561b1d 100644 --- a/ai_testbench/bin/test_time_extraction.dart +++ b/ai_testbench/bin/test_time_extraction.dart @@ -251,7 +251,9 @@ void main(List args) async { for (var i = 0; i < testCases.length; i++) { final tc = testCases[i]; - print('─── Test ${i + 1}/${testCases.length}: ${tc.name} ───────────────────────'); + print( + '─── Test ${i + 1}/${testCases.length}: ${tc.name} ───────────────────────', + ); print(' Input: "${tc.transcript}"'); // Build prompt @@ -283,13 +285,15 @@ void main(List args) async { } } catch (e) { stderr.writeln(' ERROR during generation: $e'); - results.add(TestResult( - testCase: tc, - llmDuration: genSw.elapsed, - tokenCount: tokenCount, - status: TestStatus.fail, - failures: ['LLM generation error: $e'], - )); + results.add( + TestResult( + testCase: tc, + llmDuration: genSw.elapsed, + tokenCount: tokenCount, + status: TestStatus.fail, + failures: ['LLM generation error: $e'], + ), + ); print(''); continue; } @@ -300,10 +304,14 @@ void main(List args) async { // Strip end-of-turn tokens raw = raw.replaceAll('<|im_end|>', '').trim(); // Strip thinking blocks (Qwen3 models may use these) - raw = raw.replaceAll(RegExp(r'.*?', dotAll: true), '').trim(); + raw = raw + .replaceAll(RegExp(r'.*?', dotAll: true), '') + .trim(); final secs = genSw.elapsed.inMilliseconds / 1000; - print(' LLM time: ${secs.toStringAsFixed(2)}s (~${(tokenCount / secs).toStringAsFixed(1)} tok/s)'); + print( + ' LLM time: ${secs.toStringAsFixed(2)}s (~${(tokenCount / secs).toStringAsFixed(1)} tok/s)', + ); if (verbose) { print(' Raw output:'); @@ -341,13 +349,15 @@ void main(List args) async { if (!verbose) { print(' Raw output: $raw'); } - results.add(TestResult( - testCase: tc, - llmDuration: genSw.elapsed, - tokenCount: tokenCount, - status: TestStatus.fail, - failures: ['JSON parse failed: $e'], - )); + results.add( + TestResult( + testCase: tc, + llmDuration: genSw.elapsed, + tokenCount: tokenCount, + status: TestStatus.fail, + failures: ['JSON parse failed: $e'], + ), + ); print(''); continue; } @@ -355,15 +365,15 @@ void main(List args) async { // Resolve time expression with chrono ResolvedTime? resolvedTime; // Try English translation first, fall back to original expression - final timeExpr = llmResult.datetimeExpressionEnglish ?? + final timeExpr = + llmResult.datetimeExpressionEnglish ?? llmResult.datetimeExpressionOriginal; if (timeExpr != null) { - resolvedTime = resolver.resolve( - timeExpr, - referenceDate: referenceTime, - ); + resolvedTime = resolver.resolve(timeExpr, referenceDate: referenceTime); if (resolvedTime != null) { - print(' Chrono parse: ${resolvedTime.dateTime} (via ${resolvedTime.method})'); + print( + ' Chrono parse: ${resolvedTime.dateTime} (via ${resolvedTime.method})', + ); } else { print(' Chrono parse: FAILED — could not resolve "$timeExpr"'); } @@ -378,7 +388,8 @@ void main(List args) async { final intentMatch = _intentMatches(llmResult.intent, tc.expectedIntent); if (!intentMatch) { failures.add( - 'Intent mismatch: got "${llmResult.intent}", expected "${tc.expectedIntent}"'); + 'Intent mismatch: got "${llmResult.intent}", expected "${tc.expectedIntent}"', + ); } // Check 2: Time expression present/absent @@ -389,7 +400,8 @@ void main(List args) async { if (tc.expectedTimeEnglish == null && llmResult.datetimeExpressionEnglish != null) { failures.add( - 'Expected no time expression but got "${llmResult.datetimeExpressionEnglish}"'); + 'Expected no time expression but got "${llmResult.datetimeExpressionEnglish}"', + ); } // Check 3: Chrono parse succeeded when expected @@ -398,24 +410,28 @@ void main(List args) async { } if (tc.expectedDateTime == null && resolvedTime != null) { failures.add( - 'Expected no resolved time but got ${resolvedTime.dateTime}'); + 'Expected no resolved time but got ${resolvedTime.dateTime}', + ); } // Check 4: DateTime accuracy if (tc.expectedDateTime != null && resolvedTime != null) { - final diff = - resolvedTime.dateTime.difference(tc.expectedDateTime!).inMinutes.abs(); + final diff = resolvedTime.dateTime + .difference(tc.expectedDateTime!) + .inMinutes + .abs(); if (diff > tc.toleranceMinutes) { failures.add( - 'DateTime mismatch: got ${resolvedTime.dateTime}, expected ${tc.expectedDateTime} (diff: ${diff}min, tolerance: ${tc.toleranceMinutes}min)'); + 'DateTime mismatch: got ${resolvedTime.dateTime}, expected ${tc.expectedDateTime} (diff: ${diff}min, tolerance: ${tc.toleranceMinutes}min)', + ); } } final status = failures.isEmpty ? TestStatus.pass : (failures.length == 1 && !failures.first.contains('Intent')) - ? TestStatus.partial - : TestStatus.fail; + ? TestStatus.partial + : TestStatus.fail; if (failures.isEmpty) { print(' ✅ PASS'); @@ -429,15 +445,17 @@ void main(List args) async { print(' Expected: ${tc.expectedDateTime}'); } - results.add(TestResult( - testCase: tc, - llmResult: llmResult, - resolvedTime: resolvedTime, - llmDuration: genSw.elapsed, - tokenCount: tokenCount, - status: status, - failures: failures, - )); + results.add( + TestResult( + testCase: tc, + llmResult: llmResult, + resolvedTime: resolvedTime, + llmDuration: genSw.elapsed, + tokenCount: tokenCount, + status: status, + failures: failures, + ), + ); print(''); } @@ -449,12 +467,18 @@ void main(List args) async { final partial = results.where((r) => r.status == TestStatus.partial).length; final failed = results.where((r) => r.status == TestStatus.fail).length; final totalLlmTime = results.fold( - Duration.zero, (sum, r) => sum + r.llmDuration); + Duration.zero, + (sum, r) => sum + r.llmDuration, + ); print('╔══════════════════════════════════════════════════════════╗'); - print('║ Results: $passed passed, $partial partial, $failed failed ' - 'out of ${testCases.length} tests'); - print('║ Total LLM time: ${(totalLlmTime.inMilliseconds / 1000).toStringAsFixed(1)}s'); + print( + '║ Results: $passed passed, $partial partial, $failed failed ' + 'out of ${testCases.length} tests', + ); + print( + '║ Total LLM time: ${(totalLlmTime.inMilliseconds / 1000).toStringAsFixed(1)}s', + ); print('║ Model: $modelFile'); print('╚══════════════════════════════════════════════════════════╝'); @@ -463,7 +487,8 @@ void main(List args) async { print(''); print('Failed/partial tests:'); for (final r in results.where( - (r) => r.status == TestStatus.fail || r.status == TestStatus.partial)) { + (r) => r.status == TestStatus.fail || r.status == TestStatus.partial, + )) { print(' ${r.testCase.name}:'); for (final f in r.failures) { print(' - $f'); diff --git a/ai_testbench/lib/benchmark_main.dart b/ai_testbench/lib/benchmark_main.dart index a54b248..4414351 100644 --- a/ai_testbench/lib/benchmark_main.dart +++ b/ai_testbench/lib/benchmark_main.dart @@ -37,7 +37,9 @@ Future main(List args) async { if (filteredModelPaths.isEmpty) { stdout.writeln('[BenchmarkRunner] No matching .gguf files found'); if (config.modelFilter != null) { - stdout.writeln('[BenchmarkRunner] Model filter: ${config.modelFilter}'); + stdout.writeln( + '[BenchmarkRunner] Model filter: ${config.modelFilter}', + ); } exitCode = 1; return; @@ -48,7 +50,9 @@ Future main(List args) async { caseLimit: config.caseLimit, ); if (selectedCases.isEmpty) { - stdout.writeln('[BenchmarkRunner] No benchmark cases matched the request'); + stdout.writeln( + '[BenchmarkRunner] No benchmark cases matched the request', + ); if (config.caseFilter != null) { stdout.writeln('[BenchmarkRunner] Case filter: ${config.caseFilter}'); } @@ -77,11 +81,7 @@ Future main(List args) async { } } - runApp( - BenchmarkApp( - modelDirectory: modelDir, - ), - ); + runApp(BenchmarkApp(modelDirectory: modelDir)); } class _RunnerConfig { @@ -113,13 +113,17 @@ _RunnerConfig _parseConfig(List args) { return null; } - final modelDir = readValue('--model-dir') ?? Directory('models').absolute.path; + final modelDir = + readValue('--model-dir') ?? Directory('models').absolute.path; final outputPath = readValue('--output'); final modelFilter = readValue('--model'); final caseFilter = readValue('--case'); final caseLimitValue = readValue('--case-limit'); - final caseLimit = caseLimitValue == null ? null : int.tryParse(caseLimitValue); - final headless = hasFlag('--headless') || Platform.environment['AI_BENCH_HEADLESS'] == '1'; + final caseLimit = caseLimitValue == null + ? null + : int.tryParse(caseLimitValue); + final headless = + hasFlag('--headless') || Platform.environment['AI_BENCH_HEADLESS'] == '1'; return _RunnerConfig( headless: headless, @@ -219,16 +223,20 @@ Future _runHeadlessBenchmark({ 'finishedAt': finishedAt.toIso8601String(), 'modelCount': results.length, 'caseCount': selectedCases.length, - if (modelFilter != null && modelFilter.isNotEmpty) 'modelFilter': modelFilter, + if (modelFilter != null && modelFilter.isNotEmpty) + 'modelFilter': modelFilter, if (caseFilter != null && caseFilter.isNotEmpty) 'caseFilter': caseFilter, 'results': results.map(_serializeModelResult).toList(growable: false), }; - final resolvedOutputPath = outputPath ?? + final resolvedOutputPath = + outputPath ?? '${Directory.current.path}${Platform.pathSeparator}benchmark_results_${DateTime.now().millisecondsSinceEpoch}.json'; final outputFile = File(resolvedOutputPath); outputFile.parent.createSync(recursive: true); - outputFile.writeAsStringSync(const JsonEncoder.withIndent(' ').convert(report)); + outputFile.writeAsStringSync( + const JsonEncoder.withIndent(' ').convert(report), + ); stdout.writeln('[BenchmarkRunner] Headless benchmark complete'); stdout.writeln('[BenchmarkRunner] Results written to ${outputFile.path}'); @@ -258,42 +266,41 @@ Map _serializeModelResult(BenchmarkModelResult result) { 'totalCases': result.cases.length, 'avgTokensPerSecond': result.avgTokensPerSecond, 'totalElapsedMs': result.totalElapsed.inMilliseconds, - 'cases': result.cases.map((caseResult) { - return { - 'caseName': caseResult.caseName, - 'passed': caseResult.passed, - 'validJson': caseResult.validJson, - 'intentMatch': caseResult.intentMatch, - 'timePresenceMatch': caseResult.timePresenceMatch, - 'titleLanguageMatch': caseResult.titleLanguageMatch, - 'titleLanguageDetail': caseResult.titleLanguageDetail, - 'timeResolutionCorrect': caseResult.timeResolutionCorrect, - 'timeResolutionDetail': caseResult.timeResolutionDetail, - 'durationMatch': caseResult.durationMatch, - 'durationDetail': caseResult.durationDetail, - 'intent': caseResult.intent, - 'title': caseResult.title, - 'datetimeOriginal': caseResult.datetimeOriginal, - 'datetimeEnglish': caseResult.datetimeEnglish, - 'elapsedMs': caseResult.elapsed.inMilliseconds, - 'tokensPerSecond': caseResult.tokensPerSecond, - 'outputPreview': caseResult.outputPreview, - 'error': caseResult.error, - 'extractedCount': caseResult.extractedCount, - 'expectedCount': caseResult.expectedCount, - 'countMatch': caseResult.countMatch, - if (caseResult.itemFailures.isNotEmpty) - 'itemFailures': caseResult.itemFailures, - }; - }).toList(growable: false), + 'cases': result.cases + .map((caseResult) { + return { + 'caseName': caseResult.caseName, + 'passed': caseResult.passed, + 'validJson': caseResult.validJson, + 'intentMatch': caseResult.intentMatch, + 'timePresenceMatch': caseResult.timePresenceMatch, + 'titleLanguageMatch': caseResult.titleLanguageMatch, + 'titleLanguageDetail': caseResult.titleLanguageDetail, + 'timeResolutionCorrect': caseResult.timeResolutionCorrect, + 'timeResolutionDetail': caseResult.timeResolutionDetail, + 'durationMatch': caseResult.durationMatch, + 'durationDetail': caseResult.durationDetail, + 'intent': caseResult.intent, + 'title': caseResult.title, + 'datetimeOriginal': caseResult.datetimeOriginal, + 'datetimeEnglish': caseResult.datetimeEnglish, + 'elapsedMs': caseResult.elapsed.inMilliseconds, + 'tokensPerSecond': caseResult.tokensPerSecond, + 'outputPreview': caseResult.outputPreview, + 'error': caseResult.error, + 'extractedCount': caseResult.extractedCount, + 'expectedCount': caseResult.expectedCount, + 'countMatch': caseResult.countMatch, + if (caseResult.itemFailures.isNotEmpty) + 'itemFailures': caseResult.itemFailures, + }; + }) + .toList(growable: false), }; } class BenchmarkApp extends StatelessWidget { - const BenchmarkApp({ - super.key, - required this.modelDirectory, - }); + const BenchmarkApp({super.key, required this.modelDirectory}); final String modelDirectory; diff --git a/ai_testbench/lib/correction_main.dart b/ai_testbench/lib/correction_main.dart index 81f9ac4..2d1c27c 100644 --- a/ai_testbench/lib/correction_main.dart +++ b/ai_testbench/lib/correction_main.dart @@ -21,16 +21,18 @@ Future runHeadlessCorrectionBenchmark(List args) async { return null; } - final modelDir = readValue('--model-dir') ?? Directory('models').absolute.path; + final modelDir = + readValue('--model-dir') ?? Directory('models').absolute.path; final outputPath = readValue('--output'); - final modelPaths = Directory(modelDir) - .listSync() - .whereType() - .map((f) => f.path) - .where((p) => p.toLowerCase().endsWith('.gguf')) - .toList() - ..sort(); + final modelPaths = + Directory(modelDir) + .listSync() + .whereType() + .map((f) => f.path) + .where((p) => p.toLowerCase().endsWith('.gguf')) + .toList() + ..sort(); if (modelPaths.isEmpty) { stdout.writeln('[CorrectionBench] No .gguf files found in $modelDir'); @@ -45,7 +47,9 @@ Future runHeadlessCorrectionBenchmark(List args) async { for (final p in modelPaths) { stdout.writeln(' - ${p.split(Platform.pathSeparator).last}'); } - stdout.writeln('[CorrectionBench] Cases: ${CorrectionBenchmarkService.benchmarkCases.length}'); + stdout.writeln( + '[CorrectionBench] Cases: ${CorrectionBenchmarkService.benchmarkCases.length}', + ); final service = CorrectionBenchmarkService(); final startedAt = DateTime.now().toUtc(); @@ -71,17 +75,19 @@ Future runHeadlessCorrectionBenchmark(List args) async { for (final model in results) { stdout.writeln(''); - stdout.writeln('┌── ${model.modelName} ── ' - '${model.passedCases}/${model.cases.length} passed ──┐'); + stdout.writeln( + '┌── ${model.modelName} ── ' + '${model.passedCases}/${model.cases.length} passed ──┐', + ); for (final c in model.cases) { final tag = c.passed ? 'PASS' : 'FAIL'; final reasons = []; if (!c.modificationMatch) { - reasons.add(c.modificationExpected - ? 'NOT_MODIFIED' - : 'UNEXPECTED_MODIFICATION'); + reasons.add( + c.modificationExpected ? 'NOT_MODIFIED' : 'UNEXPECTED_MODIFICATION', + ); } if (!c.allMustContainFound) { reasons.add('MISSING[${c.missingKeywords.join(",")}]'); @@ -117,33 +123,41 @@ Future runHeadlessCorrectionBenchmark(List args) async { 'finishedAt': finishedAt.toIso8601String(), 'modelCount': results.length, 'caseCount': CorrectionBenchmarkService.benchmarkCases.length, - 'results': results.map((r) => { - 'modelPath': r.modelPath, - 'modelName': r.modelName, - 'passedCases': r.passedCases, - 'totalCases': r.cases.length, - 'avgTokensPerSecond': r.avgTokensPerSecond, - 'totalElapsedMs': r.totalElapsed.inMilliseconds, - 'cases': r.cases.map((c) => { - 'caseName': c.caseName, - 'passed': c.passed, - 'wasModified': c.wasModified, - 'modificationExpected': c.modificationExpected, - 'modificationMatch': c.modificationMatch, - 'allMustContainFound': c.allMustContainFound, - 'missingKeywords': c.missingKeywords, - 'allMustNotContainAbsent': c.allMustNotContainAbsent, - 'unwantedKeywordsFound': c.unwantedKeywordsFound, - 'cleanOutput': c.cleanOutput, - 'cleanOutputDetail': c.cleanOutputDetail, - 'input': c.input, - 'expectedOutput': c.expectedOutput, - 'actualOutput': c.actualOutput, - 'elapsedMs': c.elapsed.inMilliseconds, - 'tokensPerSecond': c.tokensPerSecond, - 'error': c.error, - }).toList(growable: false), - }).toList(growable: false), + 'results': results + .map( + (r) => { + 'modelPath': r.modelPath, + 'modelName': r.modelName, + 'passedCases': r.passedCases, + 'totalCases': r.cases.length, + 'avgTokensPerSecond': r.avgTokensPerSecond, + 'totalElapsedMs': r.totalElapsed.inMilliseconds, + 'cases': r.cases + .map( + (c) => { + 'caseName': c.caseName, + 'passed': c.passed, + 'wasModified': c.wasModified, + 'modificationExpected': c.modificationExpected, + 'modificationMatch': c.modificationMatch, + 'allMustContainFound': c.allMustContainFound, + 'missingKeywords': c.missingKeywords, + 'allMustNotContainAbsent': c.allMustNotContainAbsent, + 'unwantedKeywordsFound': c.unwantedKeywordsFound, + 'cleanOutput': c.cleanOutput, + 'cleanOutputDetail': c.cleanOutputDetail, + 'input': c.input, + 'expectedOutput': c.expectedOutput, + 'actualOutput': c.actualOutput, + 'elapsedMs': c.elapsed.inMilliseconds, + 'tokensPerSecond': c.tokensPerSecond, + 'error': c.error, + }, + ) + .toList(growable: false), + }, + ) + .toList(growable: false), }; final file = File(outputPath); diff --git a/ai_testbench/lib/main.dart b/ai_testbench/lib/main.dart index 1160f91..8bffc6e 100644 --- a/ai_testbench/lib/main.dart +++ b/ai_testbench/lib/main.dart @@ -82,10 +82,7 @@ class _HomeShellState extends State<_HomeShell> { selectedIndex: _index, onDestinationSelected: (i) => setState(() => _index = i), destinations: const [ - NavigationDestination( - icon: Icon(Icons.science), - label: 'Classify', - ), + NavigationDestination(icon: Icon(Icons.science), label: 'Classify'), NavigationDestination( icon: Icon(Icons.access_time), label: 'Time Extract', @@ -95,4 +92,3 @@ class _HomeShellState extends State<_HomeShell> { ); } } - diff --git a/ai_testbench/lib/router_benchmark_main.dart b/ai_testbench/lib/router_benchmark_main.dart index c953ded..6813948 100644 --- a/ai_testbench/lib/router_benchmark_main.dart +++ b/ai_testbench/lib/router_benchmark_main.dart @@ -17,15 +17,19 @@ Future runRouterBenchmark(List args) async { final modelFilter = _readArg(args, '--model'); final modelDir = _readArg(args, '--model-dir') ?? 'models'; - final modelPaths = Directory(modelDir) - .listSync() - .whereType() - .map((f) => f.path) - .where((p) => p.toLowerCase().endsWith('.gguf')) - .where( - (p) => modelFilter == null || p.toLowerCase().contains(modelFilter.toLowerCase())) - .toList() - ..sort(); + final modelPaths = + Directory(modelDir) + .listSync() + .whereType() + .map((f) => f.path) + .where((p) => p.toLowerCase().endsWith('.gguf')) + .where( + (p) => + modelFilter == null || + p.toLowerCase().contains(modelFilter.toLowerCase()), + ) + .toList() + ..sort(); if (modelPaths.isEmpty) { stdout.writeln('[RouterBench] No .gguf models found'); @@ -40,7 +44,8 @@ Future runRouterBenchmark(List args) async { ..setModel(modelPath) ..nCtx = 4096 ..nThreads = Platform.numberOfProcessors - ..maxTokens = 32 // Router output is tiny: {"route":"timer_alarm"} + ..maxTokens = + 32 // Router output is tiny: {"route":"timer_alarm"} ..temperature = 0.1 ..topP = 1.0 ..presencePenalty = 2.0 @@ -74,27 +79,34 @@ Future runRouterBenchmark(List args) async { _RouterTestCase('Remind me in 30 minutes to check the oven', 'voice_memo'), _RouterTestCase('Remind me at 3 PM to call the dentist', 'voice_memo'), _RouterTestCase( - 'Påminn mig om 10 minuter att stänga av ugnen', 'voice_memo'), + 'Påminn mig om 10 minuter att stänga av ugnen', + 'voice_memo', + ), _RouterTestCase( - 'Påminn mig klockan 15 att ringa tandläkaren', 'voice_memo'), + 'Påminn mig klockan 15 att ringa tandläkaren', + 'voice_memo', + ), // Event/note cases → voice_memo _RouterTestCase('Meeting with John next Tuesday at 2 pm', 'voice_memo'), _RouterTestCase('Buy milk and bread', 'voice_memo'), _RouterTestCase('Köp mjölk och bröd på vägen hem', 'voice_memo'), + _RouterTestCase('Tandläkare den 15 mars klockan halv 10', 'voice_memo'), _RouterTestCase( - 'Tandläkare den 15 mars klockan halv 10', 'voice_memo'), - _RouterTestCase( - 'Fika med Anna imorgon klockan 10 och sen lämna in paketet', - 'voice_memo'), + 'Fika med Anna imorgon klockan 10 och sen lämna in paketet', + 'voice_memo', + ), // Mixed cases _RouterTestCase( - 'Set a timer for 10 minutes and an alarm for 7 AM tomorrow', 'timer_alarm'), + 'Set a timer for 10 minutes and an alarm for 7 AM tomorrow', + 'timer_alarm', + ), _RouterTestCase('Set an alarm for 6:30 and buy milk', 'mixed'), _RouterTestCase( - 'Sätt en timer på 5 minuter och påminn mig klockan 3 att ringa tandläkaren', - 'mixed'), + 'Sätt en timer på 5 minuter och påminn mig klockan 3 att ringa tandläkaren', + 'mixed', + ), ]; stdout.writeln('[RouterBench] Running ${cases.length} router cases...\n'); @@ -110,10 +122,14 @@ Future runRouterBenchmark(List args) async { now: referenceTime, ); - final result = await llm.generate(prompt).timeout( + final result = await llm + .generate(prompt) + .timeout( const Duration(seconds: 30), onTimeout: () => const InferenceResult( - output: '{"route":"timeout"}', elapsed: Duration(seconds: 30)), + output: '{"route":"timeout"}', + elapsed: Duration(seconds: 30), + ), ); routerTimes.add(result.elapsed); @@ -123,19 +139,23 @@ Future runRouterBenchmark(List args) async { final status = correct ? 'OK' : 'FAIL'; stdout.writeln( - ' $status route=$route (expected=${tc.expectedRoute}) ' - '${result.elapsed.inMilliseconds}ms "${tc.transcript}"'); + ' $status route=$route (expected=${tc.expectedRoute}) ' + '${result.elapsed.inMilliseconds}ms "${tc.transcript}"', + ); } final avgRouterMs = routerTimes.fold(0, (s, d) => s + d.inMilliseconds) ~/ - routerTimes.length; + routerTimes.length; stdout.writeln( - '\n[RouterBench] Router accuracy: $routerCorrect/${cases.length}'); + '\n[RouterBench] Router accuracy: $routerCorrect/${cases.length}', + ); stdout.writeln('[RouterBench] Router avg latency: ${avgRouterMs}ms'); // ── Stage 2: Full two-stage pipeline on timer/alarm cases ── - stdout.writeln('\n[RouterBench] Running full two-stage pipeline on timer/alarm cases...\n'); + stdout.writeln( + '\n[RouterBench] Running full two-stage pipeline on timer/alarm cases...\n', + ); // Reconfigure for extraction (more tokens needed) llm.maxTokens = 384; @@ -157,10 +177,14 @@ Future runRouterBenchmark(List args) async { transcript: tc.transcript, now: referenceTime, ); - final routerResult = await llm.generate(routerPrompt).timeout( + final routerResult = await llm + .generate(routerPrompt) + .timeout( const Duration(seconds: 30), onTimeout: () => const InferenceResult( - output: '{"route":"timeout"}', elapsed: Duration(seconds: 30)), + output: '{"route":"timeout"}', + elapsed: Duration(seconds: 30), + ), ); final routerMs = routerResult.elapsed.inMilliseconds; @@ -171,10 +195,14 @@ Future runRouterBenchmark(List args) async { transcript: tc.transcript, now: referenceTime, ); - final extractResult = await llm.generate(extractPrompt).timeout( + final extractResult = await llm + .generate(extractPrompt) + .timeout( const Duration(seconds: 60), onTimeout: () => const InferenceResult( - output: '[]', elapsed: Duration(seconds: 60)), + output: '[]', + elapsed: Duration(seconds: 60), + ), ); final extractMs = extractResult.elapsed.inMilliseconds; @@ -185,29 +213,34 @@ Future runRouterBenchmark(List args) async { final first = parseResult.extractions.isNotEmpty ? parseResult.extractions.first : null; - final intentOk = first != null && - (first.intent == 'timer' || first.intent == 'alarm'); + final intentOk = + first != null && (first.intent == 'timer' || first.intent == 'alarm'); if (intentOk) extractionCorrect++; stdout.writeln( - ' ${intentOk ? "OK" : "FAIL"} intent=${first?.intent ?? "null"} ' - 'dur=${first?.durationSeconds ?? "null"} ' - 'router=${routerMs}ms extract=${extractMs}ms ' - 'total=${sw.elapsedMilliseconds}ms "${tc.transcript}"'); + ' ${intentOk ? "OK" : "FAIL"} intent=${first?.intent ?? "null"} ' + 'dur=${first?.durationSeconds ?? "null"} ' + 'router=${routerMs}ms extract=${extractMs}ms ' + 'total=${sw.elapsedMilliseconds}ms "${tc.transcript}"', + ); } final avgTotalMs = totalTimes.fold(0, (s, d) => s + d.inMilliseconds) ~/ - totalTimes.length; + totalTimes.length; stdout.writeln( - '\n[RouterBench] Extraction accuracy: $extractionCorrect/${timerCases.length}'); + '\n[RouterBench] Extraction accuracy: $extractionCorrect/${timerCases.length}', + ); stdout.writeln('[RouterBench] Avg total (router+extract): ${avgTotalMs}ms'); stdout.writeln('[RouterBench] Avg router only: ${avgRouterMs}ms'); stdout.writeln( - '[RouterBench] Avg extract only: ${avgTotalMs - avgRouterMs}ms'); + '[RouterBench] Avg extract only: ${avgTotalMs - avgRouterMs}ms', + ); // ── Stage 3: Compare with single-pass original prompt on voice_memo cases ── - stdout.writeln('\n[RouterBench] Comparing single-pass original prompt latency...\n'); + stdout.writeln( + '\n[RouterBench] Comparing single-pass original prompt latency...\n', + ); final voiceMemoCases = cases .where((c) => c.expectedRoute == 'voice_memo') @@ -223,23 +256,33 @@ Future runRouterBenchmark(List args) async { transcript: tc.transcript, now: referenceTime, ); - final result = await llm.generate(prompt).timeout( + final result = await llm + .generate(prompt) + .timeout( const Duration(seconds: 90), onTimeout: () => const InferenceResult( - output: 'timeout', elapsed: Duration(seconds: 90)), + output: 'timeout', + elapsed: Duration(seconds: 90), + ), ); singlePassTimes.add(result.elapsed); stdout.writeln( - ' single-pass: ${result.elapsed.inMilliseconds}ms "${tc.transcript}"'); + ' single-pass: ${result.elapsed.inMilliseconds}ms "${tc.transcript}"', + ); } final avgSingleMs = singlePassTimes.fold(0, (s, d) => s + d.inMilliseconds) ~/ - singlePassTimes.length; - stdout.writeln('\n[RouterBench] Avg single-pass (original prompt): ${avgSingleMs}ms'); - stdout.writeln('[RouterBench] Avg two-stage (router+extract): ${avgTotalMs}ms'); + singlePassTimes.length; + stdout.writeln( + '\n[RouterBench] Avg single-pass (original prompt): ${avgSingleMs}ms', + ); + stdout.writeln( + '[RouterBench] Avg two-stage (router+extract): ${avgTotalMs}ms', + ); stdout.writeln( - '[RouterBench] Overhead: ${avgTotalMs - avgSingleMs}ms (${((avgTotalMs - avgSingleMs) / avgSingleMs * 100).toStringAsFixed(0)}%)'); + '[RouterBench] Overhead: ${avgTotalMs - avgSingleMs}ms (${((avgTotalMs - avgSingleMs) / avgSingleMs * 100).toStringAsFixed(0)}%)', + ); llm.dispose(); stdout.writeln('\n[RouterBench] Done.'); @@ -252,7 +295,8 @@ String _parseRoute(String raw) { if (start == -1) return _guessRoute(raw); final end = raw.lastIndexOf('}'); if (end == -1) return _guessRoute(raw); - final json = jsonDecode(raw.substring(start, end + 1)) as Map; + final json = + jsonDecode(raw.substring(start, end + 1)) as Map; final route = (json['route'] as String?)?.trim().toLowerCase() ?? ''; if (route == 'timer_alarm' || route == 'voice_memo' || route == 'mixed') { return route; diff --git a/ai_testbench/lib/screens/testbench_screen.dart b/ai_testbench/lib/screens/testbench_screen.dart index d1bbc0a..3a17702 100644 --- a/ai_testbench/lib/screens/testbench_screen.dart +++ b/ai_testbench/lib/screens/testbench_screen.dart @@ -90,10 +90,7 @@ class _TestbenchScreenState extends State { final pickedPath = result.files.single.path!; setState(() { _modelPath = pickedPath; - _availableModelPaths = { - ..._availableModelPaths, - pickedPath, - }.toList() + _availableModelPaths = {..._availableModelPaths, pickedPath}.toList() ..sort(); }); } @@ -213,9 +210,13 @@ class _TestbenchScreenState extends State { final prompt = _mode == _RunMode.classify ? ChronoPromptTemplate.render( - ChronoPromptTemplate.defaultTemplate, transcript: transcript) + ChronoPromptTemplate.defaultTemplate, + transcript: transcript, + ) : PromptTemplates.summarize( - language: _selectedLanguage, transcript: transcript); + language: _selectedLanguage, + transcript: transcript, + ); setState(() { _formattedPrompt = prompt; @@ -243,10 +244,14 @@ class _TestbenchScreenState extends State { } sw.stop(); - debugPrint('[Testbench] Stream done: $streamEvents events, ' - '${_streamBuffer.length} chars, ${sw.elapsedMilliseconds}ms'); + debugPrint( + '[Testbench] Stream done: $streamEvents events, ' + '${_streamBuffer.length} chars, ${sw.elapsedMilliseconds}ms', + ); if (_streamBuffer.isNotEmpty) { - debugPrint('[Testbench] Output preview: ${_streamBuffer.substring(0, _streamBuffer.length.clamp(0, 300))}'); + debugPrint( + '[Testbench] Output preview: ${_streamBuffer.substring(0, _streamBuffer.length.clamp(0, 300))}', + ); } else { debugPrint('[Testbench] WARNING: output is empty!'); } @@ -328,7 +333,11 @@ class _TestbenchScreenState extends State { Padding( padding: const EdgeInsets.only(right: 16), child: Chip( - avatar: const Icon(Icons.check_circle, size: 18, color: Colors.green), + avatar: const Icon( + Icons.check_circle, + size: 18, + color: Colors.green, + ), label: Text( _modelPath?.split(Platform.pathSeparator).last ?? '', style: const TextStyle(fontSize: 12), @@ -401,10 +410,15 @@ class _TestbenchScreenState extends State { FilledButton.icon( onPressed: _isRunning ? null : _loadModel, icon: const Icon(Icons.memory, size: 18), - label: Text(_llm.isModelLoaded ? 'Reload Model' : 'Set Model'), + label: Text( + _llm.isModelLoaded ? 'Reload Model' : 'Set Model', + ), ), OutlinedButton.icon( - onPressed: _isRunning || _isBenchmarking || _availableModelPaths.isEmpty + onPressed: + _isRunning || + _isBenchmarking || + _availableModelPaths.isEmpty ? null : () => _runBenchmarks(), icon: _isBenchmarking @@ -418,8 +432,8 @@ class _TestbenchScreenState extends State { _isBenchmarking && _benchmarkProgress != null ? 'Benchmarking ${_benchmarkProgress!.completedRuns}/${_benchmarkProgress!.totalRuns}' : _isBenchmarking - ? 'Benchmarking…' - : 'Benchmark All', + ? 'Benchmarking…' + : 'Benchmark All', ), ), ], @@ -534,17 +548,16 @@ class _TestbenchScreenState extends State { const SizedBox(height: 24), // ── Sample transcripts ───────────────────────────────────────── - Text('Sample Transcripts', - style: Theme.of(context).textTheme.titleMedium), + Text( + 'Sample Transcripts', + style: Theme.of(context).textTheme.titleMedium, + ), const SizedBox(height: 8), ..._sampleTranscripts.map( (sample) => Padding( padding: const EdgeInsets.only(bottom: 4), child: ActionChip( - label: Text( - sample.label, - overflow: TextOverflow.ellipsis, - ), + label: Text(sample.label, overflow: TextOverflow.ellipsis), onPressed: () { _transcriptController.text = sample.text; setState(() => _selectedLanguage = sample.language); @@ -632,8 +645,10 @@ class _TestbenchScreenState extends State { if (_benchmarkResults.isNotEmpty) ...[ const SizedBox(height: 16), - Text('Benchmark Results', - style: Theme.of(context).textTheme.titleMedium), + Text( + 'Benchmark Results', + style: Theme.of(context).textTheme.titleMedium, + ), const SizedBox(height: 8), ..._benchmarkResults.map( (result) => Card( @@ -652,7 +667,9 @@ class _TestbenchScreenState extends State { spacing: 12, runSpacing: 4, children: [ - Text('Pass: ${result.passedCases}/${result.cases.length}'), + Text( + 'Pass: ${result.passedCases}/${result.cases.length}', + ), Text( 'Avg tok/s: ${result.avgTokensPerSecond.toStringAsFixed(1)}', ), @@ -688,7 +705,9 @@ class _TestbenchScreenState extends State { Text( 'time: ${caseResult.timeResolutionDetail}', style: TextStyle( - color: caseResult.timeResolutionCorrect ? Colors.green : Colors.orangeAccent, + color: caseResult.timeResolutionCorrect + ? Colors.green + : Colors.orangeAccent, fontSize: 11, ), ), @@ -732,16 +751,20 @@ class _TestbenchScreenState extends State { // ── Parsed card preview ─────────────────────────────────────── if (_parsedJson != null) ...[ - Text('Card Preview', - style: Theme.of(context).textTheme.titleMedium), + Text( + 'Card Preview', + style: Theme.of(context).textTheme.titleMedium, + ), const SizedBox(height: 8), MemoCard(data: _parsedJson!), const SizedBox(height: 24), ], // ── Raw JSON output ─────────────────────────────────────────── - Text('Raw Model Output', - style: Theme.of(context).textTheme.titleMedium), + Text( + 'Raw Model Output', + style: Theme.of(context).textTheme.titleMedium, + ), const SizedBox(height: 8), Container( width: double.infinity, @@ -838,7 +861,9 @@ class _TestbenchScreenState extends State { ], ), const SizedBox(height: 12), - LinearProgressIndicator(value: total == 0 ? null : fraction.clamp(0, 1)), + LinearProgressIndicator( + value: total == 0 ? null : fraction.clamp(0, 1), + ), const SizedBox(height: 10), Wrap( spacing: 12, @@ -846,7 +871,9 @@ class _TestbenchScreenState extends State { children: [ Text('Completed: $completed/$total'), if (progress != null) - Text('Model: ${progress.currentModelIndex + 1}/${progress.totalModels}'), + Text( + 'Model: ${progress.currentModelIndex + 1}/${progress.totalModels}', + ), if (progress != null && progress.currentCaseName.isNotEmpty) Text('Case: ${progress.currentCaseName}'), if (eta != null) Text('ETA: ${_formatDuration(eta)}'), diff --git a/ai_testbench/lib/screens/time_extraction_screen.dart b/ai_testbench/lib/screens/time_extraction_screen.dart index 3294393..6406eb7 100644 --- a/ai_testbench/lib/screens/time_extraction_screen.dart +++ b/ai_testbench/lib/screens/time_extraction_screen.dart @@ -46,13 +46,14 @@ class _TimeExtractionScreenState extends State { _log('No models/ directory found'); return; } - final models = modelDir - .listSync() - .whereType() - .map((f) => f.path) - .where((p) => p.toLowerCase().endsWith('.gguf')) - .toList() - ..sort(); + final models = + modelDir + .listSync() + .whereType() + .map((f) => f.path) + .where((p) => p.toLowerCase().endsWith('.gguf')) + .toList() + ..sort(); setState(() { _availableModels = models; @@ -135,8 +136,10 @@ class _TimeExtractionScreenState extends State { _availableModels, onProgress: (p) { setState(() => _progress = p); - _log('[${p.modelName}] [${p.completedCases}/${p.totalCases}] ' - '${p.currentCaseName}'); + _log( + '[${p.modelName}] [${p.completedCases}/${p.totalCases}] ' + '${p.currentCaseName}', + ); }, ); @@ -155,9 +158,7 @@ class _TimeExtractionScreenState extends State { @override Widget build(BuildContext context) { return Scaffold( - appBar: AppBar( - title: const Text('Time Extraction Testbench'), - ), + appBar: AppBar(title: const Text('Time Extraction Testbench')), body: Column( children: [ // ── Controls bar ── @@ -165,14 +166,11 @@ class _TimeExtractionScreenState extends State { // ── Progress indicator ── if (_isRunning && _progress != null) Padding( - padding: - const EdgeInsets.symmetric(horizontal: 16, vertical: 4), + padding: const EdgeInsets.symmetric(horizontal: 16, vertical: 4), child: Column( crossAxisAlignment: CrossAxisAlignment.start, children: [ - LinearProgressIndicator( - value: _progress!.fraction, - ), + LinearProgressIndicator(value: _progress!.fraction), const SizedBox(height: 4), Text( '${_progress!.modelName} — ' @@ -256,20 +254,40 @@ class _TimeExtractionScreenState extends State { DataColumn(label: Text('tok/s'), numeric: true), ], rows: _results.map((model) { - return DataRow(cells: [ - DataCell(Text(model.modelName, - style: const TextStyle(fontWeight: FontWeight.bold))), - DataCell(Text('${model.passedCount}', - style: const TextStyle(color: Colors.green))), - DataCell(Text('${model.partialCount}', - style: const TextStyle(color: Colors.orange))), - DataCell(Text('${model.failedCount}', - style: const TextStyle(color: Colors.red))), - DataCell(Text( - '${(model.totalElapsed.inMilliseconds / 1000).toStringAsFixed(1)}s')), - DataCell( - Text(model.avgTokensPerSecond.toStringAsFixed(1))), - ]); + return DataRow( + cells: [ + DataCell( + Text( + model.modelName, + style: const TextStyle(fontWeight: FontWeight.bold), + ), + ), + DataCell( + Text( + '${model.passedCount}', + style: const TextStyle(color: Colors.green), + ), + ), + DataCell( + Text( + '${model.partialCount}', + style: const TextStyle(color: Colors.orange), + ), + ), + DataCell( + Text( + '${model.failedCount}', + style: const TextStyle(color: Colors.red), + ), + ), + DataCell( + Text( + '${(model.totalElapsed.inMilliseconds / 1000).toStringAsFixed(1)}s', + ), + ), + DataCell(Text(model.avgTokensPerSecond.toStringAsFixed(1))), + ], + ); }).toList(), ), ); diff --git a/ai_testbench/lib/services/correction_benchmark_service.dart b/ai_testbench/lib/services/correction_benchmark_service.dart index 480aa6f..c36071e 100644 --- a/ai_testbench/lib/services/correction_benchmark_service.dart +++ b/ai_testbench/lib/services/correction_benchmark_service.dart @@ -125,17 +125,14 @@ class CorrectionModelResult { final String modelPath; final List cases; - const CorrectionModelResult({ - required this.modelPath, - required this.cases, - }); + const CorrectionModelResult({required this.modelPath, required this.cases}); String get modelName => modelPath.split(Platform.pathSeparator).last; int get passedCases => cases.where((c) => c.passed).length; double get avgTokensPerSecond => cases.isEmpty ? 0 : cases.fold(0, (sum, c) => sum + c.tokensPerSecond) / - cases.length; + cases.length; Duration get totalElapsed => cases.fold(Duration.zero, (sum, c) => sum + c.elapsed); } @@ -158,7 +155,6 @@ class CorrectionBenchmarkService { static final benchmarkCases = [ // ── English: Whisper homophone / wrong-word errors ────────────────── - const CorrectionBenchmarkCase( name: 'en_homophone_weak_week', input: 'I need to finish the report by next weak', @@ -189,14 +185,15 @@ class CorrectionBenchmarkService { ), const CorrectionBenchmarkCase( name: 'en_wrong_word_nonsense', - input: 'I have a dentist appointment and I need to cancel it because I have a cold and a terrible headache', - expectedOutput: 'I have a dentist appointment and I need to cancel it because I have a cold and a terrible headache.', + input: + 'I have a dentist appointment and I need to cancel it because I have a cold and a terrible headache', + expectedOutput: + 'I have a dentist appointment and I need to cancel it because I have a cold and a terrible headache.', mustContain: ['dentist', 'cancel', 'headache'], expectModification: false, // Clean input — should pass through ), // ── English: filler words + stuttering ────────────────────────────── - const CorrectionBenchmarkCase( name: 'en_filler_um_stutter', input: 'I I need to um finish the report and and send it to the the boss', @@ -206,35 +203,38 @@ class CorrectionBenchmarkService { ), const CorrectionBenchmarkCase( name: 'en_filler_heavy', - input: 'so uh you know I was like thinking we should you know maybe like schedule a meeting', + input: + 'so uh you know I was like thinking we should you know maybe like schedule a meeting', expectedOutput: 'I was thinking we should maybe schedule a meeting.', mustContain: ['schedule', 'meeting'], mustNotContain: [' uh '], ), // ── English: missing/wrong punctuation ────────────────────────────── - const CorrectionBenchmarkCase( name: 'en_missing_punctuation', - input: 'call the plumber tomorrow at 3 then pick up the kids at 5 and dont forget to buy groceries', - expectedOutput: "Call the plumber tomorrow at 3, then pick up the kids at 5, and don't forget to buy groceries.", + input: + 'call the plumber tomorrow at 3 then pick up the kids at 5 and dont forget to buy groceries', + expectedOutput: + "Call the plumber tomorrow at 3, then pick up the kids at 5, and don't forget to buy groceries.", mustContain: ['plumber', 'kids', 'groceries'], mustNotContain: [], - expectModification: false, // punctuation-only change is acceptable either way + expectModification: + false, // punctuation-only change is acceptable either way ), // ── English: Whisper word-boundary / context errors ───────────────── - const CorrectionBenchmarkCase( name: 'en_word_boundary', - input: 'I can not believe they moved the meeting to an other day with out telling us', - expectedOutput: 'I cannot believe they moved the meeting to another day without telling us.', + input: + 'I can not believe they moved the meeting to an other day with out telling us', + expectedOutput: + 'I cannot believe they moved the meeting to another day without telling us.', mustContain: ['another', 'without'], mustNotContain: ['an other', 'with out'], ), // ── Swedish: Whisper errors ───────────────────────────────────────── - const CorrectionBenchmarkCase( name: 'sv_filler_stutter', input: 'jag ska eh köpa köpa mjölk och bröd på på hemvägen', @@ -270,18 +270,20 @@ class CorrectionBenchmarkService { const CorrectionBenchmarkCase( name: 'sv_wrong_word_whisper', input: 'ring veterinären och boka en tid för katten som har ont i ögat', - expectedOutput: 'Ring veterinären och boka en tid för katten som har ont i ögat.', + expectedOutput: + 'Ring veterinären och boka en tid för katten som har ont i ögat.', mustContain: ['veterinären', 'katten'], expectModification: false, // Clean-ish input language: 'sv', ), // ── German: Whisper errors ────────────────────────────────────────── - const CorrectionBenchmarkCase( name: 'de_missing_umlaut', - input: 'ich muss den Arzt anrufen und einen Termin fur nachste Woche machen', - expectedOutput: 'Ich muss den Arzt anrufen und einen Termin für nächste Woche machen.', + input: + 'ich muss den Arzt anrufen und einen Termin fur nachste Woche machen', + expectedOutput: + 'Ich muss den Arzt anrufen und einen Termin für nächste Woche machen.', mustContain: ['für', 'nächste'], mustNotContain: [' fur ', 'nachste'], language: 'de', @@ -296,7 +298,6 @@ class CorrectionBenchmarkService { ), // ── English: Whisper misheard context-dependent words ────────────── - const CorrectionBenchmarkCase( name: 'en_misheard_their_there', input: 'I left my keys over their on the table', @@ -313,31 +314,35 @@ class CorrectionBenchmarkService { ), const CorrectionBenchmarkCase( name: 'en_clean_long_sentence', - input: 'I had a great meeting with the design team today and we agreed on the new color scheme for the watch face', - expectedOutput: 'I had a great meeting with the design team today and we agreed on the new color scheme for the watch face.', + input: + 'I had a great meeting with the design team today and we agreed on the new color scheme for the watch face', + expectedOutput: + 'I had a great meeting with the design team today and we agreed on the new color scheme for the watch face.', mustContain: ['design team', 'color scheme', 'watch face'], expectModification: false, ), // ── English: Whisper garbled multi-word ───────────────────────── - const CorrectionBenchmarkCase( name: 'en_garbled_sentence', - input: 'the whether is really nice today so lets go for a walk in the park', - expectedOutput: "The weather is really nice today so let's go for a walk in the park.", + input: + 'the whether is really nice today so lets go for a walk in the park', + expectedOutput: + "The weather is really nice today so let's go for a walk in the park.", mustContain: ['weather'], mustNotContain: ['whether'], ), const CorrectionBenchmarkCase( name: 'en_multiple_errors_combined', - input: 'I need to by some flower for the party its on wendsday at there house', - expectedOutput: "I need to buy some flowers for the party, it's on Wednesday at their house.", + input: + 'I need to by some flower for the party its on wendsday at there house', + expectedOutput: + "I need to buy some flowers for the party, it's on Wednesday at their house.", mustContain: ['buy', 'Wednesday'], mustNotContain: ['by some', 'wendsday'], ), // ── Swedish: more realistic Whisper errors ─────────────────── - const CorrectionBenchmarkCase( name: 'sv_filler_liksom', input: 'jag tankte liksom att vi kanske liksom borde traffas imorgon', @@ -357,14 +362,14 @@ class CorrectionBenchmarkService { const CorrectionBenchmarkCase( name: 'sv_clean_longer', input: 'Vi ses på kontoret imorgon bitti för att gå igenom rapporten.', - expectedOutput: 'Vi ses på kontoret imorgon bitti för att gå igenom rapporten.', + expectedOutput: + 'Vi ses på kontoret imorgon bitti för att gå igenom rapporten.', mustContain: ['kontoret', 'rapporten'], expectModification: false, language: 'sv', ), // ── German: more realistic Whisper errors ─────────────────── - const CorrectionBenchmarkCase( name: 'de_eszett_and_umlaut', input: 'ich weiss nicht ob er die strasse finden konnte', @@ -405,7 +410,11 @@ class CorrectionBenchmarkService { final caseResults = []; try { - for (var caseIndex = 0; caseIndex < benchmarkCases.length; caseIndex++) { + for ( + var caseIndex = 0; + caseIndex < benchmarkCases.length; + caseIndex++ + ) { final testCase = benchmarkCases[caseIndex]; onProgress?.call( CorrectionBenchmarkProgress( @@ -421,8 +430,9 @@ class CorrectionBenchmarkService { ); final prompt = buildCorrectionPrompt(testCase.input); - final maxTok = - CorrectionPromptTemplate.estimateMaxTokens(testCase.input); + final maxTok = CorrectionPromptTemplate.estimateMaxTokens( + testCase.input, + ); try { final result = await llm @@ -432,63 +442,69 @@ class CorrectionBenchmarkService { final output = _cleanOutput(result.output); final check = _evaluate(testCase, output); - caseResults.add(CorrectionCaseResult( - caseName: testCase.name, - input: testCase.input, - expectedOutput: testCase.expectedOutput, - actualOutput: output, - wasModified: check.wasModified, - modificationExpected: testCase.expectModification, - modificationMatch: check.modificationMatch, - allMustContainFound: check.allMustContainFound, - missingKeywords: check.missingKeywords, - allMustNotContainAbsent: check.allMustNotContainAbsent, - unwantedKeywordsFound: check.unwantedKeywordsFound, - cleanOutput: check.cleanOutput, - cleanOutputDetail: check.cleanOutputDetail, - elapsed: result.elapsed, - tokensPerSecond: result.tokensPerSecond, - )); + caseResults.add( + CorrectionCaseResult( + caseName: testCase.name, + input: testCase.input, + expectedOutput: testCase.expectedOutput, + actualOutput: output, + wasModified: check.wasModified, + modificationExpected: testCase.expectModification, + modificationMatch: check.modificationMatch, + allMustContainFound: check.allMustContainFound, + missingKeywords: check.missingKeywords, + allMustNotContainAbsent: check.allMustNotContainAbsent, + unwantedKeywordsFound: check.unwantedKeywordsFound, + cleanOutput: check.cleanOutput, + cleanOutputDetail: check.cleanOutputDetail, + elapsed: result.elapsed, + tokensPerSecond: result.tokensPerSecond, + ), + ); } on TimeoutException { llm.cancelInference(); - caseResults.add(CorrectionCaseResult( - caseName: testCase.name, - input: testCase.input, - expectedOutput: testCase.expectedOutput, - actualOutput: '', - wasModified: false, - modificationExpected: testCase.expectModification, - modificationMatch: false, - allMustContainFound: false, - missingKeywords: testCase.mustContain, - allMustNotContainAbsent: true, - unwantedKeywordsFound: const [], - cleanOutput: false, - cleanOutputDetail: 'Timed out', - elapsed: perCaseTimeout, - tokensPerSecond: 0, - error: 'Timed out after ${perCaseTimeout.inSeconds}s', - )); + caseResults.add( + CorrectionCaseResult( + caseName: testCase.name, + input: testCase.input, + expectedOutput: testCase.expectedOutput, + actualOutput: '', + wasModified: false, + modificationExpected: testCase.expectModification, + modificationMatch: false, + allMustContainFound: false, + missingKeywords: testCase.mustContain, + allMustNotContainAbsent: true, + unwantedKeywordsFound: const [], + cleanOutput: false, + cleanOutputDetail: 'Timed out', + elapsed: perCaseTimeout, + tokensPerSecond: 0, + error: 'Timed out after ${perCaseTimeout.inSeconds}s', + ), + ); } catch (e) { llm.cancelInference(); - caseResults.add(CorrectionCaseResult( - caseName: testCase.name, - input: testCase.input, - expectedOutput: testCase.expectedOutput, - actualOutput: '', - wasModified: false, - modificationExpected: testCase.expectModification, - modificationMatch: false, - allMustContainFound: false, - missingKeywords: testCase.mustContain, - allMustNotContainAbsent: true, - unwantedKeywordsFound: const [], - cleanOutput: false, - cleanOutputDetail: 'Error: $e', - elapsed: Duration.zero, - tokensPerSecond: 0, - error: e.toString(), - )); + caseResults.add( + CorrectionCaseResult( + caseName: testCase.name, + input: testCase.input, + expectedOutput: testCase.expectedOutput, + actualOutput: '', + wasModified: false, + modificationExpected: testCase.expectModification, + modificationMatch: false, + allMustContainFound: false, + missingKeywords: testCase.mustContain, + allMustNotContainAbsent: true, + unwantedKeywordsFound: const [], + cleanOutput: false, + cleanOutputDetail: 'Error: $e', + elapsed: Duration.zero, + tokensPerSecond: 0, + error: e.toString(), + ), + ); } completedRuns++; @@ -509,7 +525,9 @@ class CorrectionBenchmarkService { llm.dispose(); } - results.add(CorrectionModelResult(modelPath: modelPath, cases: caseResults)); + results.add( + CorrectionModelResult(modelPath: modelPath, cases: caseResults), + ); } return results; @@ -565,17 +583,18 @@ class CorrectionBenchmarkService { final normalizedInput = inputLower.replaceAll(RegExp(r'\s+'), ' ').trim(); final normalizedOutput = outputLower.replaceAll(RegExp(r'\s+'), ' ').trim(); // Strip trailing punctuation for comparison - final normalizedInputNoPunct = - normalizedInput.replaceAll(RegExp(r'[.!?,;:]+$'), '').trim(); - final normalizedOutputNoPunct = - normalizedOutput.replaceAll(RegExp(r'[.!?,;:]+$'), '').trim(); + final normalizedInputNoPunct = normalizedInput + .replaceAll(RegExp(r'[.!?,;:]+$'), '') + .trim(); + final normalizedOutputNoPunct = normalizedOutput + .replaceAll(RegExp(r'[.!?,;:]+$'), '') + .trim(); final wasModified = normalizedInputNoPunct != normalizedOutputNoPunct; // Modification expectation check // If modification is expected, it must have changed // If no modification expected, echoing it back is fine (but changing is also OK) - final modificationMatch = - testCase.expectModification ? wasModified : true; + final modificationMatch = testCase.expectModification ? wasModified : true; // Must-contain check final missingKeywords = []; @@ -614,7 +633,9 @@ class CorrectionBenchmarkService { } // Check for markdown - if (output.contains('```') || output.contains('**') || output.contains('##')) { + if (output.contains('```') || + output.contains('**') || + output.contains('##')) { return const _CleanCheck(passed: false, detail: 'contains markdown'); } @@ -639,7 +660,10 @@ class CorrectionBenchmarkService { // Check for excessive length (more than 3x input length likely means explanations) // Relaxed — just flag it if (output.length > 500) { - return const _CleanCheck(passed: false, detail: 'output suspiciously long (>500 chars)'); + return const _CleanCheck( + passed: false, + detail: 'output suspiciously long (>500 chars)', + ); } return const _CleanCheck(passed: true, detail: null); diff --git a/ai_testbench/lib/services/llm_service.dart b/ai_testbench/lib/services/llm_service.dart index 2ba39c8..3f38ad2 100644 --- a/ai_testbench/lib/services/llm_service.dart +++ b/ai_testbench/lib/services/llm_service.dart @@ -51,14 +51,21 @@ class LlmService { double topP = 1.0; double presencePenalty = 2.0; int numGpuLayers = 0; + /// When false, disables thinking/reasoning for models like Qwen3/3.5. bool enableThinking = true; static void _logFilter(String log) { - if (log.contains('loaded') || log.contains('error') || log.contains('Error') || - log.contains('token') || log.contains('speed') || log.contains('FAILED') || - log.contains('Model loaded') || log.contains('Initialized') || - log.contains('Backend initialized') || log.contains('Available backends')) { + if (log.contains('loaded') || + log.contains('error') || + log.contains('Error') || + log.contains('token') || + log.contains('speed') || + log.contains('FAILED') || + log.contains('Model loaded') || + log.contains('Initialized') || + log.contains('Backend initialized') || + log.contains('Available backends')) { debugPrint('[llama.cpp] $log'); } } @@ -111,16 +118,17 @@ class LlmService { ); _requestInFlight = true; - _runningRequestId = await fllamaChat( - request, - (String response, String responseJson, bool done) { - if (done && !completer.isCompleted) { - _requestInFlight = false; - _runningRequestId = -1; - completer.complete(response); - } - }, - ); + _runningRequestId = await fllamaChat(request, ( + String response, + String responseJson, + bool done, + ) { + if (done && !completer.isCompleted) { + _requestInFlight = false; + _runningRequestId = -1; + completer.complete(response); + } + }); final output = await completer.future; sw.stop(); @@ -148,7 +156,10 @@ class LlmService { /// Strip Qwen3-style reasoning blocks from output. static String _stripThinkingTags(String text) { // Remove complete ... blocks - var cleaned = text.replaceAll(RegExp(r'.*?', dotAll: true), ''); + var cleaned = text.replaceAll( + RegExp(r'.*?', dotAll: true), + '', + ); // Remove unclosed (thinking consumed entire budget) cleaned = cleaned.replaceAll(RegExp(r'.*', dotAll: true), ''); return cleaned.trim(); @@ -188,20 +199,17 @@ class LlmService { ); _requestInFlight = true; - fllamaChat( - request, - (String response, String responseJson, bool done) { - debugPrint('[LlmService] stream cb: done=$done, len=${response.length}'); - if (!controller.isClosed) { - controller.add(response); - if (done) { - _requestInFlight = false; - _runningRequestId = -1; - controller.close(); - } + fllamaChat(request, (String response, String responseJson, bool done) { + debugPrint('[LlmService] stream cb: done=$done, len=${response.length}'); + if (!controller.isClosed) { + controller.add(response); + if (done) { + _requestInFlight = false; + _runningRequestId = -1; + controller.close(); } - }, - ).then((id) { + } + }).then((id) { _runningRequestId = id; }); diff --git a/ai_testbench/lib/services/model_benchmark_service.dart b/ai_testbench/lib/services/model_benchmark_service.dart index 77698a1..0397b3e 100644 --- a/ai_testbench/lib/services/model_benchmark_service.dart +++ b/ai_testbench/lib/services/model_benchmark_service.dart @@ -58,16 +58,16 @@ class BenchmarkCase { int? expectedDurationSeconds, int durationToleranceSeconds = 5, }) : expectedItems = [ - ExpectedItem( - expectedIntent: expectedIntent, - expectTime: expectTime, - titleLanguageKeywords: titleLanguageKeywords, - expectedDateTime: expectedDateTime, - toleranceMinutes: toleranceMinutes, - expectedDurationSeconds: expectedDurationSeconds, - durationToleranceSeconds: durationToleranceSeconds, - ), - ]; + ExpectedItem( + expectedIntent: expectedIntent, + expectTime: expectTime, + titleLanguageKeywords: titleLanguageKeywords, + expectedDateTime: expectedDateTime, + toleranceMinutes: toleranceMinutes, + expectedDurationSeconds: expectedDurationSeconds, + durationToleranceSeconds: durationToleranceSeconds, + ), + ]; /// Shorthand accessors for single-item cases (used by existing code). String get expectedIntent => expectedItems.first.expectedIntent; @@ -190,17 +190,14 @@ class BenchmarkModelResult { final String modelPath; final List cases; - const BenchmarkModelResult({ - required this.modelPath, - required this.cases, - }); + const BenchmarkModelResult({required this.modelPath, required this.cases}); String get modelName => modelPath.split(Platform.pathSeparator).last; int get passedCases => cases.where((c) => c.passed).length; double get avgTokensPerSecond => cases.isEmpty ? 0 : cases.fold(0, (sum, c) => sum + c.tokensPerSecond) / - cases.length; + cases.length; Duration get totalElapsed => cases.fold(Duration.zero, (sum, c) => sum + c.elapsed); } @@ -220,7 +217,6 @@ class ModelBenchmarkService { static final benchmarkCases = [ // ── English single-item cases ────────────────────────────────────── - BenchmarkCase.single( name: 'en_event_precise_time', transcript: @@ -276,7 +272,6 @@ class ModelBenchmarkService { ), // ── Swedish single-item cases (native language title validation) ── - BenchmarkCase.single( name: 'sv_reminder_tomorrow', transcript: 'Påminn mig imorgon klockan 8 att ringa tandläkaren.', @@ -319,7 +314,6 @@ class ModelBenchmarkService { ), // ── German single-item cases (native language title validation) ─── - BenchmarkCase.single( name: 'de_event_appointment', transcript: @@ -340,7 +334,6 @@ class ModelBenchmarkService { ), // ── Additional English single-item cases ───────────────────────── - BenchmarkCase.single( name: 'en_reminder_specific_date', transcript: 'Submit the expense report by March 20th at 9 AM.', @@ -366,7 +359,6 @@ class ModelBenchmarkService { ), // ── Additional Swedish single-item cases ───────────────────────── - BenchmarkCase.single( name: 'sv_event_fika', transcript: 'Fika med Lisa på fredag klockan 15.', @@ -439,7 +431,6 @@ class ModelBenchmarkService { ), // ── Voice note tests – short (1-2 sentences, casual/fragmented) ── - BenchmarkCase.single( name: 'voice_short_en_idea', transcript: 'Maybe add a compass widget to the watch face.', @@ -484,7 +475,6 @@ class ModelBenchmarkService { ), // ── Voice note tests – long (rambling, filler words, multi-sentence) ─ - BenchmarkCase.single( name: 'voice_long_en_rambling_reminder', transcript: @@ -531,7 +521,6 @@ class ModelBenchmarkService { ), // ── Additional multi-item cases ────────────────────────────────── - BenchmarkCase( name: 'sv_multi_fika_and_errand', transcript: @@ -622,7 +611,6 @@ class ModelBenchmarkService { ), // ── Conversational Swedish / Swenglish voice notes ─────────────── - BenchmarkCase( name: 'sv_multi_dev_tasks_swenglish', transcript: @@ -671,7 +659,6 @@ class ModelBenchmarkService { ), // ── Multi-item cases ───────────────────────────────────────────── - BenchmarkCase( name: 'en_multi_two_reminders', transcript: @@ -730,10 +717,7 @@ class ModelBenchmarkService { transcript: 'Tomorrow at 8 go for a run then have lunch with Mike at noon and buy groceries.', expectedItems: [ - ExpectedItem( - expectedIntent: 'reminder', - expectTime: true, - ), + ExpectedItem(expectedIntent: 'reminder', expectTime: true), ExpectedItem( expectedIntent: 'event', expectTime: true, @@ -823,7 +807,6 @@ class ModelBenchmarkService { ), // ── Timer cases ────────────────────────────────────────────────── - BenchmarkCase.single( name: 'en_timer_simple', transcript: 'Set a timer for 8 minutes', @@ -906,7 +889,6 @@ class ModelBenchmarkService { ), // ── Alarm cases ────────────────────────────────────────────────── - BenchmarkCase.single( name: 'en_alarm_morning', transcript: 'Set an alarm for 7:30 AM', @@ -983,11 +965,9 @@ class ModelBenchmarkService { ), // ── Multi-item cases with timers/alarms ────────────────────────── - BenchmarkCase( name: 'en_multi_timer_and_alarm', - transcript: - 'Set a timer for 10 minutes and an alarm for 7 AM tomorrow', + transcript: 'Set a timer for 10 minutes and an alarm for 7 AM tomorrow', expectedItems: [ ExpectedItem( expectedIntent: 'timer', @@ -1039,7 +1019,6 @@ class ModelBenchmarkService { // ── No-speech / garbage input cases ───────────────────────────────── // These should produce an empty array []. The LLM should not // hallucinate actions from noise or silence markers. - BenchmarkCase( name: 'nospeech_blank_audio_marker', transcript: '[BLANK_AUDIO]', @@ -1103,9 +1082,7 @@ class ModelBenchmarkService { final caseResults = []; try { - for (var caseIndex = 0; - caseIndex < casesToRun.length; - caseIndex++) { + for (var caseIndex = 0; caseIndex < casesToRun.length; caseIndex++) { final testCase = casesToRun[caseIndex]; onProgress?.call( BenchmarkProgress( @@ -1128,16 +1105,15 @@ class ModelBenchmarkService { ); try { - final result = await llm - .generate(prompt) - .timeout(perCaseTimeout); + final result = await llm.generate(prompt).timeout(perCaseTimeout); // Parse using the shared ChronoLlmParser final parseResult = _parser.parse(result.output); final extractions = parseResult.extractions; // An empty array [] is valid JSON when we expect 0 items // (no-speech / garbage input). - final validJson = extractions.isNotEmpty || + final validJson = + extractions.isNotEmpty || (testCase.expectedCount == 0 && _containsEmptyJsonArray(result.output)); @@ -1157,8 +1133,9 @@ class ModelBenchmarkService { String? allDurationDetail; final itemFailures = []; - final checkCount = - extractedCount < expectedCount ? extractedCount : expectedCount; + final checkCount = extractedCount < expectedCount + ? extractedCount + : expectedCount; for (var i = 0; i < checkCount; i++) { final ext = extractions[i]; final exp = testCase.expectedItems[i]; @@ -1167,51 +1144,49 @@ class ModelBenchmarkService { if (!intentOk) { allIntentMatch = false; itemFailures.add( - 'item[$i] intent: got "${ext.intent}", expected "${exp.expectedIntent}"'); + 'item[$i] intent: got "${ext.intent}", expected "${exp.expectedIntent}"', + ); } - final hasTime = ext.datetimeExpressionOriginal != null || + final hasTime = + ext.datetimeExpressionOriginal != null || ext.datetimeExpressionEnglish != null; if (hasTime != exp.expectTime) { allTimePresenceMatch = false; itemFailures.add( - 'item[$i] time presence: got $hasTime, expected ${exp.expectTime}'); + 'item[$i] time presence: got $hasTime, expected ${exp.expectTime}', + ); } - final titleLang = - _checkTitleLanguageForItem(ext.title, exp); + final titleLang = _checkTitleLanguageForItem(ext.title, exp); if (!titleLang.passed) { allTitleLangMatch = false; - itemFailures.add( - 'item[$i] title lang: ${titleLang.detail}'); + itemFailures.add('item[$i] title lang: ${titleLang.detail}'); } - allTitleLangDetail = (allTitleLangDetail ?? '') + + allTitleLangDetail = + (allTitleLangDetail ?? '') + 'item[$i]: ${titleLang.detail}; '; final timeRes = _checkTimeResolutionForItem( - ext.datetimeExpressionEnglish ?? - ext.datetimeExpressionOriginal, + ext.datetimeExpressionEnglish ?? ext.datetimeExpressionOriginal, exp, resolver, ); if (!timeRes.passed) { allTimeResMatch = false; - itemFailures.add( - 'item[$i] time: ${timeRes.detail}'); + itemFailures.add('item[$i] time: ${timeRes.detail}'); } - allTimeResDetail = (allTimeResDetail ?? '') + - 'item[$i]: ${timeRes.detail}; '; + allTimeResDetail = + (allTimeResDetail ?? '') + 'item[$i]: ${timeRes.detail}; '; // Duration validation for timer intents - final durRes = _checkDurationForItem( - ext.durationSeconds, exp); + final durRes = _checkDurationForItem(ext.durationSeconds, exp); if (!durRes.passed) { allDurationMatch = false; - itemFailures.add( - 'item[$i] duration: ${durRes.detail}'); + itemFailures.add('item[$i] duration: ${durRes.detail}'); } - allDurationDetail = (allDurationDetail ?? '') + - 'item[$i]: ${durRes.detail}; '; + allDurationDetail = + (allDurationDetail ?? '') + 'item[$i]: ${durRes.detail}; '; } // If count mismatch, mark missing items as failures @@ -1221,8 +1196,7 @@ class ModelBenchmarkService { allTimePresenceMatch = false; } for (var i = checkCount; i < extractedCount; i++) { - itemFailures - .add('item[$i] unexpected extra extraction'); + itemFailures.add('item[$i] unexpected extra extraction'); } // Use first extraction for summary fields (backward compat) @@ -1268,8 +1242,7 @@ class ModelBenchmarkService { intent: 'timeout', elapsed: perCaseTimeout, tokensPerSecond: 0, - outputPreview: - 'Timed out after ${perCaseTimeout.inSeconds}s', + outputPreview: 'Timed out after ${perCaseTimeout.inSeconds}s', error: 'Timed out after ${perCaseTimeout.inSeconds}s', extractedCount: 0, expectedCount: testCase.expectedCount, @@ -1317,7 +1290,8 @@ class ModelBenchmarkService { } results.add( - BenchmarkModelResult(modelPath: modelPath, cases: caseResults)); + BenchmarkModelResult(modelPath: modelPath, cases: caseResults), + ); } onProgress?.call( @@ -1326,12 +1300,10 @@ class ModelBenchmarkService { totalCasesPerModel: totalCases, totalRuns: totalRuns, completedRuns: completedRuns, - currentModelIndex: - modelPaths.isEmpty ? 0 : modelPaths.length - 1, + currentModelIndex: modelPaths.isEmpty ? 0 : modelPaths.length - 1, currentCaseIndex: totalCases == 0 ? 0 : totalCases - 1, currentModelPath: modelPaths.isEmpty ? '' : modelPaths.last, - currentCaseName: - benchmarkCases.isEmpty ? '' : benchmarkCases.last.name, + currentCaseName: benchmarkCases.isEmpty ? '' : benchmarkCases.last.name, ), ); @@ -1366,10 +1338,7 @@ class ModelBenchmarkService { return const _CheckResult(passed: true, detail: 'no keyword check'); } if (title == null || title.isEmpty) { - return const _CheckResult( - passed: false, - detail: 'no title in output', - ); + return const _CheckResult(passed: false, detail: 'no title in output'); } final lower = title.toLowerCase(); @@ -1407,10 +1376,7 @@ class ModelBenchmarkService { '(diff ${diff}s, tolerance ${item.durationToleranceSeconds}s)', ); } - return _CheckResult( - passed: true, - detail: '${gotDuration}s OK', - ); + return _CheckResult(passed: true, detail: '${gotDuration}s OK'); } _CheckResult _checkTimeResolution( @@ -1419,7 +1385,10 @@ class ModelBenchmarkService { TimeExpressionResolver resolver, ) { return _checkTimeResolutionForItem( - timeExpr, testCase.expectedItems.first, resolver); + timeExpr, + testCase.expectedItems.first, + resolver, + ); } _CheckResult _checkTimeResolutionForItem( @@ -1437,10 +1406,7 @@ class ModelBenchmarkService { ); } - final resolved = resolver.resolve( - timeExpr, - referenceDate: referenceTime, - ); + final resolved = resolver.resolve(timeExpr, referenceDate: referenceTime); if (resolved == null) { return _CheckResult( diff --git a/ai_testbench/lib/services/time_extraction_benchmark_service.dart b/ai_testbench/lib/services/time_extraction_benchmark_service.dart index 608a448..2169587 100644 --- a/ai_testbench/lib/services/time_extraction_benchmark_service.dart +++ b/ai_testbench/lib/services/time_extraction_benchmark_service.dart @@ -84,8 +84,7 @@ class TimeExtractionProgress { required this.modelName, }); - double get fraction => - totalCases == 0 ? 0 : completedCases / totalCases; + double get fraction => totalCases == 0 ? 0 : completedCases / totalCases; } // ── Aggregate result for a model ──────────────────────────────────────── @@ -100,16 +99,14 @@ class TimeExtractionModelResult { }); String get modelName => modelPath.split(Platform.pathSeparator).last; - int get passedCount => - cases.where((c) => c.status == TestStatus.pass).length; + int get passedCount => cases.where((c) => c.status == TestStatus.pass).length; int get partialCount => cases.where((c) => c.status == TestStatus.partial).length; - int get failedCount => - cases.where((c) => c.status == TestStatus.fail).length; + int get failedCount => cases.where((c) => c.status == TestStatus.fail).length; double get avgTokensPerSecond => cases.isEmpty ? 0 : cases.fold(0, (sum, c) => sum + c.tokensPerSecond) / - cases.length; + cases.length; Duration get totalElapsed => cases.fold(Duration.zero, (sum, c) => sum + c.llmDuration); } @@ -275,16 +272,16 @@ class TimeExtractionBenchmarkService { for (var i = 0; i < testCases.length; i++) { final tc = testCases[i]; - onProgress?.call(TimeExtractionProgress( - totalCases: testCases.length, - completedCases: i, - currentCaseName: tc.name, - modelName: modelName, - )); - - debugPrint( - '\n─── Test ${i + 1}/${testCases.length}: ${tc.name} ───', + onProgress?.call( + TimeExtractionProgress( + totalCases: testCases.length, + completedCases: i, + currentCaseName: tc.name, + modelName: modelName, + ), ); + + debugPrint('\n─── Test ${i + 1}/${testCases.length}: ${tc.name} ───'); debugPrint(' Input: "${tc.transcript}"'); try { @@ -301,9 +298,7 @@ class TimeExtractionBenchmarkService { while (true) { attempts++; - result = await llm - .generate(prompt) - .timeout(perCaseTimeout); + result = await llm.generate(prompt).timeout(perCaseTimeout); totalElapsed += result.elapsed; lastTokensPerSecond = result.tokensPerSecond; llmResult = _parseLlmOutput(result.output); @@ -314,39 +309,47 @@ class TimeExtractionBenchmarkService { break; } - debugPrint(' ↻ Retrying invalid output (attempt ${attempts + 1}/2)'); + debugPrint( + ' ↻ Retrying invalid output (attempt ${attempts + 1}/2)', + ); } - debugPrint(' LLM time: ${totalElapsed.inMilliseconds}ms ' - '(${lastTokensPerSecond.toStringAsFixed(1)} tok/s)'); + debugPrint( + ' LLM time: ${totalElapsed.inMilliseconds}ms ' + '(${lastTokensPerSecond.toStringAsFixed(1)} tok/s)', + ); debugPrint(' attempts: $attempts'); debugPrint(' intent: ${llmResult.intent}'); debugPrint(' title: ${llmResult.title}'); - debugPrint(' time (orig): ${llmResult.datetimeExpressionOriginal}'); + debugPrint( + ' time (orig): ${llmResult.datetimeExpressionOriginal}', + ); debugPrint(' time (EN): ${llmResult.datetimeExpressionEnglish}'); // Resolve time expression ResolvedTime? resolvedTime; - final timeExpr = llmResult.datetimeExpressionEnglish ?? + final timeExpr = + llmResult.datetimeExpressionEnglish ?? llmResult.datetimeExpressionOriginal; if (timeExpr != null) { resolvedTime = resolver.resolve( timeExpr, referenceDate: referenceTime, ); - debugPrint(resolvedTime != null - ? ' Chrono: ${resolvedTime.dateTime} (via ${resolvedTime.method})' - : ' Chrono: FAILED for "$timeExpr"'); + debugPrint( + resolvedTime != null + ? ' Chrono: ${resolvedTime.dateTime} (via ${resolvedTime.method})' + : ' Chrono: FAILED for "$timeExpr"', + ); } // Evaluate final failures = _evaluate(tc, llmResult, resolvedTime); final status = failures.isEmpty ? TestStatus.pass - : (failures.length == 1 && - !failures.first.contains('Intent')) - ? TestStatus.partial - : TestStatus.fail; + : (failures.length == 1 && !failures.first.contains('Intent')) + ? TestStatus.partial + : TestStatus.fail; for (final f in failures) { debugPrint(' ❌ $f'); @@ -355,52 +358,59 @@ class TimeExtractionBenchmarkService { debugPrint(' ✅ PASS'); } - caseResults.add(TimeExtractionTestResult( - testCase: tc, - llmResult: llmResult, - resolvedTime: resolvedTime, - llmDuration: totalElapsed, - tokensPerSecond: lastTokensPerSecond, - status: status, - failures: failures, - )); + caseResults.add( + TimeExtractionTestResult( + testCase: tc, + llmResult: llmResult, + resolvedTime: resolvedTime, + llmDuration: totalElapsed, + tokensPerSecond: lastTokensPerSecond, + status: status, + failures: failures, + ), + ); } on TimeoutException { llm.cancelInference(); debugPrint(' ⏱ TIMEOUT'); - caseResults.add(TimeExtractionTestResult( - testCase: tc, - llmDuration: perCaseTimeout, - tokensPerSecond: 0, - status: TestStatus.fail, - failures: ['Timed out after ${perCaseTimeout.inSeconds}s'], - )); + caseResults.add( + TimeExtractionTestResult( + testCase: tc, + llmDuration: perCaseTimeout, + tokensPerSecond: 0, + status: TestStatus.fail, + failures: ['Timed out after ${perCaseTimeout.inSeconds}s'], + ), + ); } catch (e) { llm.cancelInference(); debugPrint(' ❌ ERROR: $e'); - caseResults.add(TimeExtractionTestResult( - testCase: tc, - llmDuration: Duration.zero, - tokensPerSecond: 0, - status: TestStatus.fail, - failures: ['Error: $e'], - )); + caseResults.add( + TimeExtractionTestResult( + testCase: tc, + llmDuration: Duration.zero, + tokensPerSecond: 0, + status: TestStatus.fail, + failures: ['Error: $e'], + ), + ); } - onProgress?.call(TimeExtractionProgress( - totalCases: testCases.length, - completedCases: i + 1, - currentCaseName: tc.name, - modelName: modelName, - )); + onProgress?.call( + TimeExtractionProgress( + totalCases: testCases.length, + completedCases: i + 1, + currentCaseName: tc.name, + modelName: modelName, + ), + ); } } finally { llm.dispose(); } - results.add(TimeExtractionModelResult( - modelPath: modelPath, - cases: caseResults, - )); + results.add( + TimeExtractionModelResult(modelPath: modelPath, cases: caseResults), + ); } return results; @@ -424,11 +434,13 @@ class TimeExtractionBenchmarkService { LlmExtractionResult llmResult, ) { final hasIntent = llmResult.intent != null && llmResult.intent!.isNotEmpty; - final hasAnyTime = (llmResult.datetimeExpressionEnglish != null && + final hasAnyTime = + (llmResult.datetimeExpressionEnglish != null && llmResult.datetimeExpressionEnglish!.isNotEmpty) || (llmResult.datetimeExpressionOriginal != null && llmResult.datetimeExpressionOriginal!.isNotEmpty); - final hasJsonFields = hasIntent || + final hasJsonFields = + hasIntent || llmResult.title != null || llmResult.datetimeExpressionEnglish != null || llmResult.datetimeExpressionOriginal != null; @@ -517,13 +529,19 @@ class TimeExtractionBenchmarkService { for (final model in results) { buf.writeln('╔══════════════════════════════════════════════════════╗'); buf.writeln('║ Model: ${model.modelName}'); - buf.writeln('║ Results: ${model.passedCount} passed, ' - '${model.partialCount} partial, ${model.failedCount} failed ' - 'of ${model.cases.length}'); - buf.writeln('║ Total time: ' - '${(model.totalElapsed.inMilliseconds / 1000).toStringAsFixed(1)}s'); - buf.writeln('║ Avg: ' - '${model.avgTokensPerSecond.toStringAsFixed(1)} tok/s'); + buf.writeln( + '║ Results: ${model.passedCount} passed, ' + '${model.partialCount} partial, ${model.failedCount} failed ' + 'of ${model.cases.length}', + ); + buf.writeln( + '║ Total time: ' + '${(model.totalElapsed.inMilliseconds / 1000).toStringAsFixed(1)}s', + ); + buf.writeln( + '║ Avg: ' + '${model.avgTokensPerSecond.toStringAsFixed(1)} tok/s', + ); buf.writeln('╚══════════════════════════════════════════════════════╝'); buf.writeln(); @@ -539,20 +557,24 @@ class TimeExtractionBenchmarkService { buf.writeln(' Intent: ${r.llmResult!.intent}'); buf.writeln(' Title: ${r.llmResult!.title}'); buf.writeln( - ' Time (orig): ${r.llmResult!.datetimeExpressionOriginal}'); + ' Time (orig): ${r.llmResult!.datetimeExpressionOriginal}', + ); buf.writeln( - ' Time (EN): ${r.llmResult!.datetimeExpressionEnglish}'); + ' Time (EN): ${r.llmResult!.datetimeExpressionEnglish}', + ); } if (r.resolvedTime != null) { buf.writeln( - ' Resolved: ${r.resolvedTime!.dateTime} (${r.resolvedTime!.method})'); + ' Resolved: ${r.resolvedTime!.dateTime} (${r.resolvedTime!.method})', + ); } if (r.testCase.expectedDateTime != null) { buf.writeln(' Expected: ${r.testCase.expectedDateTime}'); } buf.writeln( - ' LLM: ${r.llmDuration.inMilliseconds}ms, ' - '${r.tokensPerSecond.toStringAsFixed(1)} tok/s'); + ' LLM: ${r.llmDuration.inMilliseconds}ms, ' + '${r.tokensPerSecond.toStringAsFixed(1)} tok/s', + ); for (final f in r.failures) { buf.writeln(' ↳ $f'); } diff --git a/ai_testbench/lib/time_extraction_main.dart b/ai_testbench/lib/time_extraction_main.dart index 86b1244..43e10fc 100644 --- a/ai_testbench/lib/time_extraction_main.dart +++ b/ai_testbench/lib/time_extraction_main.dart @@ -63,18 +63,18 @@ Future runHeadlessTimeExtraction(List args) async { return; } - var modelPaths = modelsDirectory - .listSync() - .whereType() - .map((f) => f.path) - .where((p) => p.toLowerCase().endsWith('.gguf')) - .toList() - ..sort(); + var modelPaths = + modelsDirectory + .listSync() + .whereType() + .map((f) => f.path) + .where((p) => p.toLowerCase().endsWith('.gguf')) + .toList() + ..sort(); if (modelFilter != null) { modelPaths = modelPaths - .where((p) => - p.toLowerCase().contains(modelFilter!.toLowerCase())) + .where((p) => p.toLowerCase().contains(modelFilter!.toLowerCase())) .toList(); } @@ -96,11 +96,19 @@ Future runHeadlessTimeExtraction(List args) async { for (final p in modelPaths) { stdout.writeln(' - ${p.split(Platform.pathSeparator).last}'); } - stdout.writeln('Test cases: ${TimeExtractionBenchmarkService.testCases.length}'); - stdout.writeln('Reference time: ${TimeExtractionBenchmarkService.referenceTime}'); + stdout.writeln( + 'Test cases: ${TimeExtractionBenchmarkService.testCases.length}', + ); + stdout.writeln( + 'Reference time: ${TimeExtractionBenchmarkService.referenceTime}', + ); stdout.writeln('Prompt variant: ${promptVariant.name}'); - stdout.writeln('Language hint: ${includeLanguageHint ? 'enabled' : 'disabled'}'); - stdout.writeln('Retry invalid output: ${retryInvalidOutput ? 'enabled' : 'disabled'}'); + stdout.writeln( + 'Language hint: ${includeLanguageHint ? 'enabled' : 'disabled'}', + ); + stdout.writeln( + 'Retry invalid output: ${retryInvalidOutput ? 'enabled' : 'disabled'}', + ); stdout.writeln(''); final service = TimeExtractionBenchmarkService(); diff --git a/ai_testbench/lib/widgets/memo_card.dart b/ai_testbench/lib/widgets/memo_card.dart index f71452b..bed0e6a 100644 --- a/ai_testbench/lib/widgets/memo_card.dart +++ b/ai_testbench/lib/widgets/memo_card.dart @@ -44,7 +44,9 @@ class MemoCard extends StatelessWidget { children: [ Container( padding: const EdgeInsets.symmetric( - horizontal: 8, vertical: 2), + horizontal: 8, + vertical: 2, + ), decoration: BoxDecoration( color: color.withValues(alpha: 0.15), borderRadius: BorderRadius.circular(6), @@ -62,10 +64,9 @@ class MemoCard extends StatelessWidget { const SizedBox(height: 4), Text( title, - style: Theme.of(context) - .textTheme - .titleLarge - ?.copyWith(fontWeight: FontWeight.bold), + style: Theme.of(context).textTheme.titleLarge?.copyWith( + fontWeight: FontWeight.bold, + ), ), ], ), @@ -123,14 +124,9 @@ class MemoCard extends StatelessWidget { const SizedBox(width: 8), Text( '$label: ', - style: const TextStyle( - fontSize: 13, - fontWeight: FontWeight.bold, - ), - ), - Expanded( - child: Text(value, style: const TextStyle(fontSize: 14)), + style: const TextStyle(fontSize: 13, fontWeight: FontWeight.bold), ), + Expanded(child: Text(value, style: const TextStyle(fontSize: 14))), ], ), ); diff --git a/packages/chrono_ai_flow/scripts/time_resolver_debug_manual.dart b/packages/chrono_ai_flow/scripts/time_resolver_debug_manual.dart index 8bdae16..0b18532 100644 --- a/packages/chrono_ai_flow/scripts/time_resolver_debug_manual.dart +++ b/packages/chrono_ai_flow/scripts/time_resolver_debug_manual.dart @@ -13,7 +13,9 @@ void main() { ); if (chronoResult.isNotEmpty) { final d = chronoResult.first.date(); - print('chrono raw: $d weekday=${d.weekday} daysFromRef=${d.difference(ref).inDays}'); + print( + 'chrono raw: $d weekday=${d.weekday} daysFromRef=${d.difference(ref).inDays}', + ); } final cases = [ diff --git a/packages/chrono_ai_flow/test/parser_test.dart b/packages/chrono_ai_flow/test/parser_test.dart index b8ee467..8ccd0ea 100644 --- a/packages/chrono_ai_flow/test/parser_test.dart +++ b/packages/chrono_ai_flow/test/parser_test.dart @@ -37,11 +37,15 @@ void main() { expect(result.extractions[0].intent, 'reminder'); expect(result.extractions[0].title, 'pick up dog'); expect( - result.extractions[0].datetimeExpressionEnglish, 'tomorrow at 5 pm'); + result.extractions[0].datetimeExpressionEnglish, + 'tomorrow at 5 pm', + ); expect(result.extractions[1].intent, 'reminder'); expect(result.extractions[1].title, 'turn off lights'); expect( - result.extractions[1].datetimeExpressionEnglish, 'tomorrow at 9 pm'); + result.extractions[1].datetimeExpressionEnglish, + 'tomorrow at 9 pm', + ); }); test('parses array with mixed intents', () { diff --git a/zswatch_app/.gitignore b/zswatch_app/.gitignore index 494f123..e2d70cf 100644 --- a/zswatch_app/.gitignore +++ b/zswatch_app/.gitignore @@ -44,6 +44,13 @@ app.*.map.json /android/app/profile /android/app/release +# Generated desktop plugin registrants +/linux/flutter/generated_plugin_registrant.cc +/linux/flutter/generated_plugins.cmake +/macos/Flutter/GeneratedPluginRegistrant.swift +/windows/flutter/generated_plugin_registrant.cc +/windows/flutter/generated_plugins.cmake + # Android signing (NEVER commit these!) key.properties *.keystore diff --git a/zswatch_app/android/app/build.gradle.kts b/zswatch_app/android/app/build.gradle.kts index 3dcbcbb..66da6cf 100644 --- a/zswatch_app/android/app/build.gradle.kts +++ b/zswatch_app/android/app/build.gradle.kts @@ -98,6 +98,10 @@ android { } } +dependencies { + implementation("com.google.ai.edge.litertlm:litertlm-android:0.10.0") +} + flutter { source = "../.." } diff --git a/zswatch_app/android/app/src/main/AndroidManifest.xml b/zswatch_app/android/app/src/main/AndroidManifest.xml index 5e4bfa8..cb2044d 100644 --- a/zswatch_app/android/app/src/main/AndroidManifest.xml +++ b/zswatch_app/android/app/src/main/AndroidManifest.xml @@ -17,6 +17,8 @@ + + @@ -91,6 +93,17 @@ android:name=".LlmComputeService" android:exported="false" android:foregroundServiceType="shortService" /> + + + + + + + +