From f238d89e54d4a20637755e77b32ec6be11eacda5 Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Tue, 28 Apr 2026 16:55:53 +0800
Subject: [PATCH 1/6] refactor(speech): introduce VAD provider layer and thin
 OpenAI STT client

---
 .../_migration_backup/gateway_speech_stt.go   |  87 +++
 pkg/speech/_migration_backup/stt_provider.go  | 245 +++++++
 pkg/speech/_migration_backup/stt_whisper.go   | 644 ++++++++++++++++++
 pkg/speech/_migration_backup/vad.go           | 306 +++++++++
 pkg/speech/_migration_backup/voicewake.go     | 620 +++++++++++++++++
 pkg/speech/stt_openai_client.go               | 126 ++++
 pkg/speech/stt_whisper.go                     | 135 +---
 pkg/speech/vad.go                             |   8 +
 pkg/speech/vad_provider.go                    |  55 ++
 pkg/speech/voicewake.go                       |  17 +-
 10 files changed, 2111 insertions(+), 132 deletions(-)
 create mode 100644 pkg/speech/_migration_backup/gateway_speech_stt.go
 create mode 100644 pkg/speech/_migration_backup/stt_provider.go
 create mode 100644 pkg/speech/_migration_backup/stt_whisper.go
 create mode 100644 pkg/speech/_migration_backup/vad.go
 create mode 100644 pkg/speech/_migration_backup/voicewake.go
 create mode 100644 pkg/speech/stt_openai_client.go
 create mode 100644 pkg/speech/vad_provider.go

diff --git a/pkg/speech/_migration_backup/gateway_speech_stt.go b/pkg/speech/_migration_backup/gateway_speech_stt.go
new file mode 100644
index 00000000..79bb630f
--- /dev/null
+++ b/pkg/speech/_migration_backup/gateway_speech_stt.go
@@ -0,0 +1,87 @@
+package gateway
+
+import (
+	"time"
+
+	"github.com/1024XEngineer/anyclaw/pkg/speech"
+)
+
+func (s *Server) initSTT() {
+	sttCfg := s.mainRuntime.Config.Speech.STT
+	if !sttCfg.Enabled {
+		return
+	}
+
+	s.sttManager = speech.NewSTTManager()
+
+	if sttCfg.Provider != "" && sttCfg.APIKey != "" {
+		providerType := speech.STTProviderType(sttCfg.Provider)
+		sttProviderCfg := speech.STTConfig{
+			Type:     providerType,
+			APIKey:   sttCfg.APIKey,
+			BaseURL:  sttCfg.BaseURL,
+			Model:    sttCfg.Model,
+			Language: sttCfg.DefaultLang,
+			Timeout:  time.Duration(sttCfg.TimeoutSec) * time.Second,
+		}
+		if sttCfg.TimeoutSec <= 0 {
+			sttProviderCfg.Timeout = 120 * time.Second
+		}
+
+		provider, err := speech.NewSTTProvider(sttProviderCfg)
+		if err != nil {
+			s.appendEvent("stt.init.error", "", map[string]any{"error": err.Error(), "provider": sttCfg.Provider})
+			return
+		}
+
+		if err := s.sttManager.Register(sttCfg.Provider, provider); err != nil {
+			s.appendEvent("stt.init.error", "", map[string]any{"error": err.Error(), "provider": sttCfg.Provider})
+			return
+		}
+	}
+
+	pipelineCfg := speech.STTPipelineConfig{
+		Provider:      sttCfg.Provider,
+		DefaultLang:   sttCfg.DefaultLang,
+		AutoDetect:    sttCfg.DefaultLang == "auto",
+		MaxDuration:   time.Duration(sttCfg.MaxDurationSec) * time.Second,
+		MinConfidence: sttCfg.MinConfidence,
+		Timeout:       time.Duration(sttCfg.TimeoutSec) * time.Second,
+	}
+	if sttCfg.MaxDurationSec <= 0 {
+		pipelineCfg.MaxDuration = 10 * time.Minute
+	}
+	if sttCfg.TimeoutSec <= 0 {
+		pipelineCfg.Timeout = 120 * time.Second
+	}
+
+	s.sttPipeline = speech.NewSTTPipeline(s.sttManager, pipelineCfg)
+
+	integrationCfg := speech.STTIntegrationConfig{
+		Enabled:          sttCfg.Enabled,
+		AutoSTT:          sttCfg.AutoSTT,
+		TriggerPrefix:    sttCfg.TriggerPrefix,
+		Provider:         sttCfg.Provider,
+		DefaultLang:      sttCfg.DefaultLang,
+		MaxDuration:      pipelineCfg.MaxDuration,
+		MinConfidence:    sttCfg.MinConfidence,
+		Timeout:          pipelineCfg.Timeout,
+		Channels:         sttCfg.Channels,
+		ExcludeChannels:  sttCfg.ExcludeChannels,
+		FallbackToVoice:  sttCfg.FallbackToVoice,
+		AppendTranscript: sttCfg.AppendTranscript,
+	}
+	if integrationCfg.TriggerPrefix == "" {
+		integrationCfg.TriggerPrefix = "/transcribe"
+	}
+
+	s.sttIntegration = speech.NewSTTIntegration(s.sttPipeline, integrationCfg)
+
+	s.appendEvent("stt.init.ok", "", map[string]any{
+		"provider": sttCfg.Provider,
+		"auto_stt": sttCfg.AutoSTT,
+		"language": sttCfg.DefaultLang,
+		"channels": len(sttCfg.Channels),
+		"excluded": len(sttCfg.ExcludeChannels),
+	})
+}
diff --git a/pkg/speech/_migration_backup/stt_provider.go b/pkg/speech/_migration_backup/stt_provider.go
new file mode 100644
index 00000000..5f6c9ba6
--- /dev/null
+++ b/pkg/speech/_migration_backup/stt_provider.go
@@ -0,0 +1,245 @@
+package speech
+
+import (
+	"context"
+	"fmt"
+	"time"
+)
+
+type STTProviderType string
+
+const (
+	STTProviderOpenAI        STTProviderType = "openai"
+	STTProviderAzure         STTProviderType = "azure"
+	STTProviderGoogle        STTProviderType = "google"
+	STTProviderDeepgram      STTProviderType = "deepgram"
+	STTProviderAssemblyAI    STTProviderType = "assemblyai"
+	STTProviderWhisperCPP    STTProviderType = "whisper.cpp"
+	STTProviderVosk          STTProviderType = "vosk"
+	STTProviderFasterWhisper STTProviderType = "faster-whisper"
+	STTProviderCustom        STTProviderType = "custom"
+)
+
+type AudioInputFormat string
+
+const (
+	InputMP3  AudioInputFormat = "mp3"
+	InputWAV  AudioInputFormat = "wav"
+	InputOGG  AudioInputFormat = "ogg"
+	InputFLAC AudioInputFormat = "flac"
+	InputPCM  AudioInputFormat = "pcm"
+	InputM4A  AudioInputFormat = "m4a"
+	InputMP4  AudioInputFormat = "mp4"
+	InputMPEG AudioInputFormat = "mpeg"
+	InputMPGA AudioInputFormat = "mpga"
+	InputWEBM AudioInputFormat = "webm"
+)
+
+type STTProvider interface {
+	Name() string
+	Type() STTProviderType
+	Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error)
+	ListLanguages(ctx context.Context) ([]string, error)
+}
+
+type STTConfig struct {
+	Type       STTProviderType
+	APIKey     string
+	BaseURL    string
+	Model      string
+	Language   string
+	SampleRate int
+	Timeout    time.Duration
+}
+
+func NewSTTProvider(cfg STTConfig) (STTProvider, error) {
+	switch cfg.Type {
+	case STTProviderOpenAI:
+		opts := []WhisperOption{}
+		if cfg.BaseURL != "" {
+			opts = append(opts, WithWhisperBaseURL(cfg.BaseURL))
+		}
+		if cfg.Model != "" {
+			opts = append(opts, WithWhisperModel(WhisperModel(cfg.Model)))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithWhisperLanguage(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithWhisperTimeout(cfg.Timeout))
+		}
+		return NewWhisperProvider(cfg.APIKey, opts...)
+	case STTProviderGoogle:
+		opts := []GoogleOption{}
+		if cfg.BaseURL != "" {
+			opts = append(opts, WithGoogleBaseURL(cfg.BaseURL))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithGoogleLanguageCode(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithGoogleTimeout(cfg.Timeout))
+		}
+		return NewGoogleProvider(cfg.APIKey, opts...)
+	case STTProviderWhisperCPP:
+		opts := []WhisperCPPOption{}
+		if cfg.Model != "" {
+			opts = append(opts, WithWhisperCPPModelPath(cfg.Model))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithWhisperCPPLanguage(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithWhisperCPPTimeout(cfg.Timeout))
+		}
+		return NewWhisperCPPProvider(opts...)
+	default:
+		return nil, NewSTTError(ErrProviderNotSupported, "unknown STT provider: "+string(cfg.Type))
+	}
+}
+
+type TranscribeMode string
+
+const (
+	ModeTranscription TranscribeMode = "transcription"
+	ModeTranslation   TranscribeMode = "translation"
+)
+
+type TranscribeOptions struct {
+	Language        string
+	Model           string
+	Prompt          string
+	Temperature     float64
+	Mode            TranscribeMode
+	InputFormat     AudioInputFormat
+	SampleRate      int
+	WordTimestamps  bool
+	SpeakerLabels   bool
+	MaxAlternatives int
+}
+
+type TranscribeOption func(*TranscribeOptions)
+
+func WithSTTLanguage(lang string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Language = lang
+	}
+}
+
+func WithSTTModel(model string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Model = model
+	}
+}
+
+func WithSTTPrompt(prompt string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Prompt = prompt
+	}
+}
+
+func WithSTTTemperature(temp float64) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Temperature = temp
+	}
+}
+
+func WithSTTMode(mode TranscribeMode) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Mode = mode
+	}
+}
+
+func WithSTTInputFormat(format AudioInputFormat) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.InputFormat = format
+	}
+}
+
+func WithSTTSampleRate(rate int) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.SampleRate = rate
+	}
+}
+
+func WithSTTWordTimestamps(enabled bool) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.WordTimestamps = enabled
+	}
+}
+
+func WithSTTSpeakerLabels(enabled bool) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.SpeakerLabels = enabled
+	}
+}
+
+func WithSTTMaxAlternatives(n int) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.MaxAlternatives = n
+	}
+}
+
+type WordInfo struct {
+	Word       string
+	StartTime  time.Duration
+	EndTime    time.Duration
+	Confidence float64
+}
+
+type SegmentInfo struct {
+	ID         int
+	Text       string
+	StartTime  time.Duration
+	EndTime    time.Duration
+	Confidence float64
+	Speaker    string
+	Words      []WordInfo
+}
+
+type TranscriptResult struct {
+	Text         string
+	Language     string
+	Duration     time.Duration
+	Confidence   float64
+	Segments     []SegmentInfo
+	Words        []WordInfo
+	Alternatives []string
+}
+
+type STTErrorCode string
+
+const (
+	ErrProviderNotSupported STTErrorCode = "provider_not_supported"
+	ErrAudioFormatInvalid   STTErrorCode = "audio_format_invalid"
+	ErrTranscriptionFailed  STTErrorCode = "transcription_failed"
+	ErrAudioTooLong         STTErrorCode = "audio_too_long"
+	ErrAudioTooLarge        STTErrorCode = "audio_too_large"
+	ErrRateLimited          STTErrorCode = "rate_limited"
+	ErrAuthentication       STTErrorCode = "authentication_failed"
+)
+
+type STTError struct {
+	Code    STTErrorCode
+	Message string
+	Err     error
+}
+
+func NewSTTError(code STTErrorCode, message string) *STTError {
+	return &STTError{Code: code, Message: message}
+}
+
+func NewSTTErrorf(code STTErrorCode, format string, args ...interface{}) *STTError {
+	return &STTError{Code: code, Message: fmt.Sprintf(format, args...)}
+}
+
+func (e *STTError) Error() string {
+	if e.Err != nil {
+		return string(e.Code) + ": " + e.Message + ": " + e.Err.Error()
+	}
+	return string(e.Code) + ": " + e.Message
+}
+
+func (e *STTError) Unwrap() error {
+	return e.Err
+}
diff --git a/pkg/speech/_migration_backup/stt_whisper.go b/pkg/speech/_migration_backup/stt_whisper.go
new file mode 100644
index 00000000..9ac04a86
--- /dev/null
+++ b/pkg/speech/_migration_backup/stt_whisper.go
@@ -0,0 +1,644 @@
+package speech
+
+import (
+	"bufio"
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"mime/multipart"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type WhisperModel string
+
+const (
+	WhisperModelV1 WhisperModel = "whisper-1"
+)
+
+var validWhisperModels = map[WhisperModel]bool{
+	WhisperModelV1: true,
+}
+
+var validInputFormats = map[AudioInputFormat]bool{
+	InputMP3:  true,
+	InputWAV:  true,
+	InputOGG:  true,
+	InputFLAC: true,
+	InputM4A:  true,
+	InputMP4:  true,
+	InputMPEG: true,
+	InputMPGA: true,
+	InputWEBM: true,
+}
+
+type WhisperProvider struct {
+	apiKey        string
+	baseURL       string
+	model         WhisperModel
+	language      string
+	timeout       time.Duration
+	retries       int
+	client        *http.Client
+	httpTransport *http.Transport
+}
+
+type WhisperOption func(*WhisperProvider)
+
+func WithWhisperBaseURL(url string) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.baseURL = strings.TrimRight(url, "/")
+	}
+}
+
+func WithWhisperModel(model WhisperModel) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.model = model
+	}
+}
+
+func WithWhisperLanguage(lang string) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.language = lang
+	}
+}
+
+func WithWhisperTimeout(timeout time.Duration) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.timeout = timeout
+	}
+}
+
+func WithWhisperRetries(retries int) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.retries = retries
+	}
+}
+
+func WithWhisperHTTPTransport(transport *http.Transport) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.httpTransport = transport
+	}
+}
+
+func NewWhisperProvider(apiKey string, opts ...WhisperOption) (*WhisperProvider, error) {
+	if apiKey == "" {
+		return nil, NewSTTError(ErrAuthentication, "openai: API key is required")
+	}
+
+	p := &WhisperProvider{
+		apiKey:  apiKey,
+		baseURL: "https://api.openai.com",
+		model:   WhisperModelV1,
+		timeout: 120 * time.Second,
+		retries: 2,
+		client:  &http.Client{Timeout: 120 * time.Second},
+	}
+
+	for _, opt := range opts {
+		opt(p)
+	}
+
+	if p.httpTransport != nil {
+		p.client.Transport = p.httpTransport
+	}
+	p.client.Timeout = p.timeout
+
+	if !validWhisperModels[p.model] {
+		return nil, NewSTTErrorf(ErrProviderNotSupported, "openai: invalid whisper model: %s", p.model)
+	}
+
+	return p, nil
+}
+
+func (p *WhisperProvider) Name() string {
+	return "openai-whisper"
+}
+
+func (p *WhisperProvider) Type() STTProviderType {
+	return STTProviderOpenAI
+}
+
+func (p *WhisperProvider) Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error) {
+	options := TranscribeOptions{
+		Model:       string(p.model),
+		Language:    p.language,
+		Temperature: 0,
+		Mode:        ModeTranscription,
+		InputFormat: InputMP3,
+	}
+	for _, opt := range opts {
+		opt(&options)
+	}
+
+	if err := p.validateTranscribeOptions(options); err != nil {
+		return nil, err
+	}
+
+	if len(audio) == 0 {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
+	}
+
+	const maxAudioSize = 25 * 1024 * 1024
+	if len(audio) > maxAudioSize {
+		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: audio exceeds 25MB limit (%d bytes)", len(audio))
+	}
+
+	if !validInputFormats[options.InputFormat] {
+		return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: unsupported input format: %s", options.InputFormat)
+	}
+
+	var lastErr error
+	for attempt := 0; attempt <= p.retries; attempt++ {
+		if attempt > 0 {
+			backoff := time.Duration(attempt) * time.Second
+			select {
+			case <-ctx.Done():
+				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: context cancelled during retry: %v", ctx.Err())
+			case <-time.After(backoff):
+			}
+		}
+
+		result, err := p.doTranscribe(ctx, audio, options)
+		if err == nil {
+			return result, nil
+		}
+
+		lastErr = err
+
+		if sttErr, ok := err.(*STTError); ok {
+			if sttErr.Code == ErrAuthentication || sttErr.Code == ErrAudioFormatInvalid || sttErr.Code == ErrAudioTooLarge || sttErr.Code == ErrRateLimited {
+				return nil, err
+			}
+		}
+	}
+
+	return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: all %d retries failed: %v", p.retries, lastErr)
+}
+
+func (p *WhisperProvider) TranscribeFile(ctx context.Context, filePath string, opts ...TranscribeOption) (*TranscriptResult, error) {
+	if filePath == "" {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: file path is empty")
+	}
+
+	info, err := os.Stat(filePath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: file not found: %s", filePath)
+		}
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to stat file: %v", err)
+	}
+
+	const maxAudioSize = 25 * 1024 * 1024
+	if info.Size() > maxAudioSize {
+		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: file exceeds 25MB limit (%d bytes)", info.Size())
+	}
+
+	audio, err := os.ReadFile(filePath)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read file: %v", err)
+	}
+
+	if len(opts) == 0 || anyInputFormatNotSet(opts) {
+		ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filePath)), ".")
+		if ext != "" {
+			formatOpts := append([]TranscribeOption{WithSTTInputFormat(AudioInputFormat(ext))}, opts...)
+			return p.Transcribe(ctx, audio, formatOpts...)
+		}
+	}
+
+	return p.Transcribe(ctx, audio, opts...)
+}
+
+func anyInputFormatNotSet(opts []TranscribeOption) bool {
+	for _, opt := range opts {
+		o := &TranscribeOptions{}
+		opt(o)
+		if o.InputFormat != "" {
+			return false
+		}
+	}
+	return true
+}
+
+func (p *WhisperProvider) TranscribeStream(ctx context.Context, reader io.Reader, opts ...TranscribeOption) (*TranscriptResult, error) {
+	if reader == nil {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: reader is nil")
+	}
+
+	audio, err := io.ReadAll(reader)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read stream: %v", err)
+	}
+
+	return p.Transcribe(ctx, audio, opts...)
+}
+
+func (p *WhisperProvider) validateTranscribeOptions(options TranscribeOptions) error {
+	if options.Temperature < 0 || options.Temperature > 1 {
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: temperature must be between 0 and 1, got: %f", options.Temperature)
+	}
+
+	if options.MaxAlternatives < 0 {
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: maxAlternatives cannot be negative")
+	}
+
+	if options.Model == "" {
+		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: model is required")
+	}
+
+	return nil
+}
+
+func (p *WhisperProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+
+	filename := "audio." + string(options.InputFormat)
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
+	}
+
+	if _, err := part.Write(audio); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
+	}
+
+	if err := writer.WriteField("model", options.Model); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
+	}
+
+	if options.Language != "" {
+		if err := writer.WriteField("language", options.Language); err != nil {
+			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
+		}
+	}
+
+	if options.Prompt != "" {
+		if err := writer.WriteField("prompt", options.Prompt); err != nil {
+			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write prompt field: %v", err)
+		}
+	}
+
+	if options.Temperature > 0 {
+		if err := writer.WriteField("temperature", fmt.Sprintf("%.2f", options.Temperature)); err != nil {
+			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write temperature field: %v", err)
+		}
+	}
+
+	if options.MaxAlternatives > 0 {
+		if err := writer.WriteField("max_alternatives", fmt.Sprintf("%d", options.MaxAlternatives)); err != nil {
+			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write max_alternatives field: %v", err)
+		}
+	}
+
+	if options.WordTimestamps || options.SpeakerLabels {
+		if options.WordTimestamps {
+			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
+				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
+			}
+		}
+		if options.SpeakerLabels {
+			if err := writer.WriteField("timestamp_granularities[]", "segment"); err != nil {
+				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write segment timestamp_granularities: %v", err)
+			}
+		}
+	}
+
+	responseType := "verbose_json"
+	if options.WordTimestamps || options.SpeakerLabels {
+		responseType = "verbose_json"
+	}
+	if err := writer.WriteField("response_format", responseType); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
+	}
+
+	if err := writer.Close(); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
+	}
+
+	var endpoint string
+	switch options.Mode {
+	case ModeTranslation:
+		endpoint = "/v1/audio/translations"
+	default:
+		endpoint = "/v1/audio/transcriptions"
+	}
+
+	url := p.baseURL + endpoint
+
+	req, err := http.NewRequestWithContext(ctx, "POST", url, &body)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
+	}
+
+	req.Header.Set("Authorization", "Bearer "+p.apiKey)
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
+
+	resp, err := p.client.Do(req)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: request failed: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return nil, p.handleErrorResponse(resp.StatusCode, respBody)
+	}
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read response: %v", err)
+	}
+
+	return p.parseResponse(respBody, options)
+}
+
+func (p *WhisperProvider) handleErrorResponse(statusCode int, body []byte) error {
+	var errResp whisperErrorResponse
+	if err := json.Unmarshal(body, &errResp); err == nil && errResp.Error.Message != "" {
+		msg := fmt.Sprintf("openai-whisper: API error: %s (type: %s, code: %s)",
+			errResp.Error.Message, errResp.Error.Type, errResp.Error.Code)
+		switch statusCode {
+		case http.StatusUnauthorized:
+			return NewSTTError(ErrAuthentication, msg)
+		case http.StatusTooManyRequests:
+			return NewSTTError(ErrRateLimited, msg)
+		case http.StatusBadRequest:
+			return NewSTTError(ErrAudioFormatInvalid, msg)
+		default:
+			return NewSTTError(ErrTranscriptionFailed, msg)
+		}
+	}
+
+	switch statusCode {
+	case http.StatusUnauthorized:
+		return NewSTTError(ErrAuthentication, fmt.Sprintf("openai-whisper: authentication failed: %s", string(body)))
+	case http.StatusTooManyRequests:
+		return NewSTTError(ErrRateLimited, fmt.Sprintf("openai-whisper: rate limited: %s", string(body)))
+	case http.StatusBadRequest:
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: invalid request: %s", string(body))
+	case http.StatusServiceUnavailable:
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: service unavailable: %s", string(body))
+	default:
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: unexpected status %d: %s", statusCode, string(body))
+	}
+}
+
+type whisperResponse struct {
+	Text     string  `json:"text"`
+	Language string  `json:"language"`
+	Duration float64 `json:"duration,omitempty"`
+	Segments []struct {
+		ID           int     `json:"id"`
+		Seek         int     `json:"seek"`
+		Start        float64 `json:"start"`
+		End          float64 `json:"end"`
+		Text         string  `json:"text"`
+		Tokens       []int   `json:"tokens"`
+		Temperature  float64 `json:"temperature"`
+		AvgLogProb   float64 `json:"avg_logprob"`
+		Compression  float64 `json:"compression_ratio"`
+		NoSpeechProb float64 `json:"no_speech_prob"`
+		Words        []struct {
+			Word       string  `json:"word"`
+			Start      float64 `json:"start"`
+			End        float64 `json:"end"`
+			Confidence float64 `json:"probability"`
+		} `json:"words,omitempty"`
+	} `json:"segments,omitempty"`
+	LanguageProbability float64 `json:"language_probability,omitempty"`
+}
+
+type whisperErrorResponse struct {
+	Error struct {
+		Message string `json:"message"`
+		Type    string `json:"type"`
+		Code    string `json:"code"`
+	} `json:"error"`
+}
+
+func (p *WhisperProvider) parseResponse(body []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	var resp whisperResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to parse JSON response: %v", err)
+	}
+
+	result := &TranscriptResult{
+		Text:       strings.TrimSpace(resp.Text),
+		Language:   resp.Language,
+		Duration:   time.Duration(resp.Duration * float64(time.Second)),
+		Confidence: resp.LanguageProbability,
+	}
+
+	if len(resp.Segments) > 0 {
+		result.Segments = make([]SegmentInfo, 0, len(resp.Segments))
+		for _, seg := range resp.Segments {
+			segment := SegmentInfo{
+				ID:        seg.ID,
+				Text:      seg.Text,
+				StartTime: time.Duration(seg.Start * float64(time.Second)),
+				EndTime:   time.Duration(seg.End * float64(time.Second)),
+			}
+
+			if seg.AvgLogProb != 0 {
+				segment.Confidence = normalizeLogProb(seg.AvgLogProb)
+			}
+
+			if len(seg.Words) > 0 {
+				segment.Words = make([]WordInfo, 0, len(seg.Words))
+				for _, w := range seg.Words {
+					segment.Words = append(segment.Words, WordInfo{
+						Word:       w.Word,
+						StartTime:  time.Duration(w.Start * float64(time.Second)),
+						EndTime:    time.Duration(w.End * float64(time.Second)),
+						Confidence: w.Confidence,
+					})
+				}
+			}
+
+			result.Segments = append(result.Segments, segment)
+		}
+
+		if len(result.Segments) > 0 && result.Confidence == 0 {
+			totalConfidence := 0.0
+			for _, seg := range result.Segments {
+				totalConfidence += seg.Confidence
+			}
+			result.Confidence = totalConfidence / float64(len(result.Segments))
+		}
+	}
+
+	if options.WordTimestamps && len(result.Segments) > 0 {
+		words := make([]WordInfo, 0)
+		for _, seg := range result.Segments {
+			words = append(words, seg.Words...)
+		}
+		result.Words = words
+	}
+
+	return result, nil
+}
+
+func normalizeLogProb(logProb float64) float64 {
+	if logProb > 0 {
+		return 1.0
+	}
+	prob := 1.0 / (1.0 + logProb*-1)
+	if prob < 0 {
+		return 0
+	}
+	if prob > 1 {
+		return 1
+	}
+	return prob
+}
+
+func (p *WhisperProvider) TranscribeSSE(ctx context.Context, audio []byte, onChunk func(chunk *TranscriptResult), opts ...TranscribeOption) error {
+	options := TranscribeOptions{
+		Model:       string(p.model),
+		Language:    p.language,
+		Temperature: 0,
+		Mode:        ModeTranscription,
+		InputFormat: InputMP3,
+	}
+	for _, opt := range opts {
+		opt(&options)
+	}
+
+	if err := p.validateTranscribeOptions(options); err != nil {
+		return err
+	}
+
+	if len(audio) == 0 {
+		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
+	}
+
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+
+	filename := "audio." + string(options.InputFormat)
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
+	}
+
+	if _, err := part.Write(audio); err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
+	}
+
+	if err := writer.WriteField("model", options.Model); err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
+	}
+
+	if options.Language != "" {
+		if err := writer.WriteField("language", options.Language); err != nil {
+			return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
+		}
+	}
+
+	if err := writer.WriteField("response_format", "json"); err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
+	}
+
+	if err := writer.WriteField("stream", "true"); err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write stream field: %v", err)
+	}
+
+	if err := writer.Close(); err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
+	}
+
+	endpoint := "/v1/audio/transcriptions"
+	if options.Mode == ModeTranslation {
+		endpoint = "/v1/audio/translations"
+	}
+
+	url := p.baseURL + endpoint
+
+	req, err := http.NewRequestWithContext(ctx, "POST", url, &body)
+	if err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
+	}
+
+	req.Header.Set("Authorization", "Bearer "+p.apiKey)
+	req.Header.Set("Content-Type", writer.FormDataContentType())
+	req.Header.Set("Accept", "text/event-stream")
+
+	resp, err := p.client.Do(req)
+	if err != nil {
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: streaming request failed: %v", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return p.handleErrorResponse(resp.StatusCode, respBody)
+	}
+
+	return p.readSSEStream(resp.Body, onChunk)
+}
+
+func (p *WhisperProvider) readSSEStream(reader io.Reader, onChunk func(chunk *TranscriptResult)) error {
+	scanner := bufio.NewScanner(reader)
+	scanner.Split(bufio.ScanLines)
+
+	var currentText strings.Builder
+	var detectedLanguage string
+
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		if strings.HasPrefix(line, "data: ") {
+			data := strings.TrimPrefix(line, "data: ")
+			if data == "[DONE]" {
+				break
+			}
+
+			var chunk struct {
+				Text     string `json:"text"`
+				Language string `json:"language"`
+				Done     bool   `json:"done"`
+			}
+			if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+				continue
+			}
+
+			if chunk.Text != "" {
+				currentText.WriteString(chunk.Text)
+			}
+			if chunk.Language != "" {
+				detectedLanguage = chunk.Language
+			}
+
+			onChunk(&TranscriptResult{
+				Text:     currentText.String(),
+				Language: detectedLanguage,
+			})
+
+			if chunk.Done {
+				break
+			}
+		}
+	}
+
+	return scanner.Err()
+}
+
+func (p *WhisperProvider) ListLanguages(ctx context.Context) ([]string, error) {
+	return []string{
+		"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da",
+		"de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi",
+		"hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb",
+		"ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "nn",
+		"no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq",
+		"sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi",
+		"yi", "yo", "zh",
+	}, nil
+}
diff --git a/pkg/speech/_migration_backup/vad.go b/pkg/speech/_migration_backup/vad.go
new file mode 100644
index 00000000..fc36af30
--- /dev/null
+++ b/pkg/speech/_migration_backup/vad.go
@@ -0,0 +1,306 @@
+package speech
+
+import (
+	"math"
+	"sync"
+)
+
+type VADState string
+
+const (
+	VADStateSilence VADState = "silence"
+	VADStateSpeech  VADState = "speech"
+)
+
+type VADConfig struct {
+	SampleRate         int
+	FrameSize          int
+	EnergyThreshold    float64
+	ZeroCrossThreshold int
+	SpeechMinFrames    int
+	SilenceFrames      int
+	HangoverFrames     int
+}
+
+func DefaultVADConfig() VADConfig {
+	return VADConfig{
+		SampleRate:         16000,
+		FrameSize:          320,
+		EnergyThreshold:    0.01,
+		ZeroCrossThreshold: 50,
+		SpeechMinFrames:    3,
+		SilenceFrames:      30,
+		HangoverFrames:     10,
+	}
+}
+
+type VAD struct {
+	mu                 sync.Mutex
+	cfg                VADConfig
+	state              VADState
+	consecutiveSpeech  int
+	consecutiveSilence int
+	listeners          []VADStateListener
+}
+
+type VADStateListener func(state VADState, energy float64, zcr float64)
+
+func NewVAD(cfg VADConfig) *VAD {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.EnergyThreshold == 0 {
+		cfg.EnergyThreshold = 0.01
+	}
+	if cfg.ZeroCrossThreshold == 0 {
+		cfg.ZeroCrossThreshold = 50
+	}
+	if cfg.SpeechMinFrames == 0 {
+		cfg.SpeechMinFrames = 3
+	}
+	if cfg.SilenceFrames == 0 {
+		cfg.SilenceFrames = 30
+	}
+	if cfg.HangoverFrames == 0 {
+		cfg.HangoverFrames = 10
+	}
+
+	return &VAD{
+		cfg:   cfg,
+		state: VADStateSilence,
+	}
+}
+
+func (v *VAD) RegisterListener(listener VADStateListener) {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	v.listeners = append(v.listeners, listener)
+}
+
+func (v *VAD) ProcessFrame(samples []int16) VADState {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+
+	energy := v.calculateRMS(samples)
+	zcr := v.calculateZeroCrossingRate(samples)
+
+	isSpeech := v.isSpeechFrame(energy, zcr)
+
+	if isSpeech {
+		v.consecutiveSpeech++
+		v.consecutiveSilence = 0
+	} else {
+		v.consecutiveSilence++
+		v.consecutiveSpeech = 0
+	}
+
+	switch v.state {
+	case VADStateSilence:
+		if isSpeech {
+			if v.consecutiveSpeech >= v.cfg.SpeechMinFrames {
+				v.state = VADStateSpeech
+				v.notifyListeners(VADStateSpeech, energy, zcr)
+			}
+		} else {
+			v.consecutiveSpeech = 0
+		}
+
+	case VADStateSpeech:
+		if isSpeech {
+			v.consecutiveSilence = 0
+		} else {
+			if v.consecutiveSilence >= v.cfg.HangoverFrames {
+				v.state = VADStateSilence
+				v.consecutiveSpeech = 0
+				v.consecutiveSilence = 0
+				v.notifyListeners(VADStateSilence, energy, zcr)
+			}
+		}
+	}
+
+	return v.state
+}
+
+func (v *VAD) ProcessFloatFrame(samples []float32) VADState {
+	intSamples := make([]int16, len(samples))
+	for i, s := range samples {
+		clamped := s
+		if clamped > 1.0 {
+			clamped = 1.0
+		}
+		if clamped < -1.0 {
+			clamped = -1.0
+		}
+		intSamples[i] = int16(clamped * 32767.0)
+	}
+	return v.ProcessFrame(intSamples)
+}
+
+func (v *VAD) isSpeechFrame(energy float64, zcr float64) bool {
+	return energy > v.cfg.EnergyThreshold || zcr > float64(v.cfg.ZeroCrossThreshold)
+}
+
+func (v *VAD) calculateRMS(samples []int16) float64 {
+	if len(samples) == 0 {
+		return 0
+	}
+
+	var sumSquares float64
+	for _, s := range samples {
+		normalized := float64(s) / 32768.0
+		sumSquares += normalized * normalized
+	}
+
+	return math.Sqrt(sumSquares / float64(len(samples)))
+}
+
+func (v *VAD) calculateZeroCrossingRate(samples []int16) float64 {
+	if len(samples) < 2 {
+		return 0
+	}
+
+	var crossings int
+	for i := 1; i < len(samples); i++ {
+		if (samples[i] >= 0 && samples[i-1] < 0) || (samples[i] < 0 && samples[i-1] >= 0) {
+			crossings++
+		}
+	}
+
+	return float64(crossings)
+}
+
+func (v *VAD) State() VADState {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	return v.state
+}
+
+func (v *VAD) Reset() {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	v.state = VADStateSilence
+	v.consecutiveSpeech = 0
+	v.consecutiveSilence = 0
+}
+
+func (v *VAD) notifyListeners(state VADState, energy float64, zcr float64) {
+	for _, listener := range v.listeners {
+		listener(state, energy, zcr)
+	}
+}
+
+func (v *VAD) UpdateConfig(cfg VADConfig) {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	if cfg.EnergyThreshold > 0 {
+		v.cfg.EnergyThreshold = cfg.EnergyThreshold
+	}
+	if cfg.ZeroCrossThreshold > 0 {
+		v.cfg.ZeroCrossThreshold = cfg.ZeroCrossThreshold
+	}
+	if cfg.SpeechMinFrames > 0 {
+		v.cfg.SpeechMinFrames = cfg.SpeechMinFrames
+	}
+	if cfg.SilenceFrames > 0 {
+		v.cfg.SilenceFrames = cfg.SilenceFrames
+	}
+	if cfg.HangoverFrames > 0 {
+		v.cfg.HangoverFrames = cfg.HangoverFrames
+	}
+}
+
+func (v *VAD) Config() VADConfig {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	return v.cfg
+}
+
+func NormalizeAudio(samples []int16) []float64 {
+	result := make([]float64, len(samples))
+	for i, s := range samples {
+		result[i] = float64(s) / 32768.0
+	}
+	return result
+}
+
+func Float32ToInt16(samples []float32) []int16 {
+	result := make([]int16, len(samples))
+	for i, s := range samples {
+		clamped := s
+		if clamped > 1.0 {
+			clamped = 1.0
+		}
+		if clamped < -1.0 {
+			clamped = -1.0
+		}
+		result[i] = int16(clamped * 32767.0)
+	}
+	return result
+}
+
+func Int16ToWAV(samples []int16, sampleRate int, channels int) []byte {
+	if len(samples) == 0 {
+		return nil
+	}
+
+	bitsPerSample := 16
+	byteRate := sampleRate * channels * bitsPerSample / 8
+	blockAlign := channels * bitsPerSample / 8
+	dataSize := len(samples) * 2
+	fileSize := 36 + dataSize
+
+	buf := make([]byte, 44+dataSize)
+
+	copy(buf[0:4], []byte("RIFF"))
+	buf[4] = byte(fileSize)
+	buf[5] = byte(fileSize >> 8)
+	buf[6] = byte(fileSize >> 16)
+	buf[7] = byte(fileSize >> 24)
+
+	copy(buf[8:12], []byte("WAVE"))
+
+	copy(buf[12:16], []byte("fmt "))
+	buf[16] = 16
+	buf[17] = 0
+	buf[18] = 0
+	buf[19] = 0
+
+	buf[20] = 1
+	buf[21] = 0
+
+	buf[22] = byte(channels)
+	buf[23] = 0
+
+	buf[24] = byte(sampleRate)
+	buf[25] = byte(sampleRate >> 8)
+	buf[26] = byte(sampleRate >> 16)
+	buf[27] = byte(sampleRate >> 24)
+
+	buf[28] = byte(byteRate)
+	buf[29] = byte(byteRate >> 8)
+	buf[30] = byte(byteRate >> 16)
+	buf[31] = byte(byteRate >> 24)
+
+	buf[32] = byte(blockAlign)
+	buf[33] = 0
+
+	buf[34] = byte(bitsPerSample)
+	buf[35] = 0
+
+	copy(buf[36:40], []byte("data"))
+	buf[40] = byte(dataSize)
+	buf[41] = byte(dataSize >> 8)
+	buf[42] = byte(dataSize >> 16)
+	buf[43] = byte(dataSize >> 24)
+
+	for i, s := range samples {
+		offset := 44 + i*2
+		buf[offset] = byte(s)
+		buf[offset+1] = byte(s >> 8)
+	}
+
+	return buf
+}
diff --git a/pkg/speech/_migration_backup/voicewake.go b/pkg/speech/_migration_backup/voicewake.go
new file mode 100644
index 00000000..5807bbf9
--- /dev/null
+++ b/pkg/speech/_migration_backup/voicewake.go
@@ -0,0 +1,620 @@
+package speech
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"sync"
+	"time"
+)
+
+type VoiceWakeState string
+
+const (
+	VoiceWakeStateIdle       VoiceWakeState = "idle"
+	VoiceWakeStateListening  VoiceWakeState = "listening"
+	VoiceWakeStateRecording  VoiceWakeState = "recording"
+	VoiceWakeStateProcessing VoiceWakeState = "processing"
+	VoiceWakeStateTriggered  VoiceWakeState = "triggered"
+)
+
+type VoiceWakeEventType string
+
+const (
+	VoiceWakeEventStateChanged VoiceWakeEventType = "state_changed"
+	VoiceWakeEventWakeDetected VoiceWakeEventType = "wake_detected"
+	VoiceWakeEventSpeechStart  VoiceWakeEventType = "speech_start"
+	VoiceWakeEventSpeechEnd    VoiceWakeEventType = "speech_end"
+	VoiceWakeEventError        VoiceWakeEventType = "error"
+)
+
+type VoiceWakeEvent struct {
+	Type      VoiceWakeEventType
+	State     VoiceWakeState
+	Timestamp time.Time
+	Data      map[string]any
+}
+
+type VoiceWakeListener func(event VoiceWakeEvent)
+
+type AudioSource interface {
+	Start(ctx context.Context) error
+	Stop() error
+	Read(samples []int16) (int, error)
+	SampleRate() int
+	Channels() int
+}
+
+type VoiceWakeConfig struct {
+	VADConfig        VADConfig
+	WakeWordConfig   WakeWordConfig
+	EngineConfig     WakeWordEngineConfig
+	SampleRate       int
+	Channels         int
+	FrameSize        int
+	MaxRecordingTime time.Duration
+	CooldownTime     time.Duration
+	AudioSource      AudioSource
+	STTPipeline      *STTPipeline
+	AutoTranscribe   bool
+	WakeWordEngine   WakeWordEngineType
+}
+
+func DefaultVoiceWakeConfig() VoiceWakeConfig {
+	return VoiceWakeConfig{
+		VADConfig:        DefaultVADConfig(),
+		WakeWordConfig:   DefaultWakeWordConfig(),
+		SampleRate:       16000,
+		Channels:         1,
+		FrameSize:        320,
+		MaxRecordingTime: 30 * time.Second,
+		CooldownTime:     2 * time.Second,
+		AutoTranscribe:   true,
+	}
+}
+
+type VoiceWake struct {
+	mu              sync.Mutex
+	cfg             VoiceWakeConfig
+	state           VoiceWakeState
+	vad             *VAD
+	wakeDetector    *WakeWordDetector
+	engineRouter    *WakeWordEngineRouter
+	engineAdapter   *WakeWordEngineAdapter
+	listeners       []VoiceWakeListener
+	audioBuffer     []int16
+	recordingBuffer []int16
+	isRecording     bool
+	recordingStart  time.Time
+	cooldownUntil   time.Time
+	ctx             context.Context
+	cancel          context.CancelFunc
+	wg              sync.WaitGroup
+	transcriber     *STTPipeline
+	lastTranscript  string
+	lastWakeMatch   string
+	lastConfidence  float64
+	lastEnergy      float64
+}
+
+func NewVoiceWake(cfg VoiceWakeConfig) *VoiceWake {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.Channels == 0 {
+		cfg.Channels = 1
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.MaxRecordingTime == 0 {
+		cfg.MaxRecordingTime = 30 * time.Second
+	}
+	if cfg.CooldownTime == 0 {
+		cfg.CooldownTime = 2 * time.Second
+	}
+
+	cfg.VADConfig.SampleRate = cfg.SampleRate
+	cfg.VADConfig.FrameSize = cfg.FrameSize
+
+	cfg.EngineConfig.SampleRate = cfg.SampleRate
+	cfg.EngineConfig.FrameSize = cfg.FrameSize
+
+	vad := NewVAD(cfg.VADConfig)
+	wakeDetector := NewWakeWordDetector(cfg.WakeWordConfig)
+
+	router := NewWakeWordEngineRouter(cfg.EngineConfig)
+	adapter := NewWakeWordEngineAdapter(router, wakeDetector)
+
+	vw := &VoiceWake{
+		cfg:           cfg,
+		state:         VoiceWakeStateIdle,
+		vad:           vad,
+		wakeDetector:  wakeDetector,
+		engineRouter:  router,
+		engineAdapter: adapter,
+		transcriber:   cfg.STTPipeline,
+	}
+
+	vad.RegisterListener(vw.onVADStateChanged)
+
+	return vw
+}
+
+func (vw *VoiceWake) RegisterListener(listener VoiceWakeListener) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	vw.listeners = append(vw.listeners, listener)
+}
+
+func (vw *VoiceWake) Start(ctx context.Context) error {
+	vw.mu.Lock()
+	if vw.state != VoiceWakeStateIdle {
+		vw.mu.Unlock()
+		return fmt.Errorf("voicewake: already in state %s", vw.state)
+	}
+	vw.state = VoiceWakeStateListening
+	vw.mu.Unlock()
+
+	vw.ctx, vw.cancel = context.WithCancel(ctx)
+
+	if vw.cfg.AudioSource != nil {
+		if err := vw.cfg.AudioSource.Start(vw.ctx); err != nil {
+			vw.mu.Lock()
+			vw.state = VoiceWakeStateIdle
+			vw.mu.Unlock()
+			return fmt.Errorf("voicewake: failed to start audio source: %w", err)
+		}
+	}
+
+	vw.wg.Add(1)
+	go vw.listenLoop()
+
+	vw.notifyListeners(VoiceWakeEvent{
+		Type:      VoiceWakeEventStateChanged,
+		State:     VoiceWakeStateListening,
+		Timestamp: time.Now(),
+		Data:      map[string]any{"message": "Voice wake listener started"},
+	})
+
+	return nil
+}
+
+func (vw *VoiceWake) Stop() error {
+	vw.mu.Lock()
+	if vw.state == VoiceWakeStateIdle {
+		vw.mu.Unlock()
+		return nil
+	}
+
+	if vw.cancel != nil {
+		vw.cancel()
+	}
+	vw.state = VoiceWakeStateIdle
+	vw.mu.Unlock()
+
+	if vw.cfg.AudioSource != nil {
+		_ = vw.cfg.AudioSource.Stop()
+	}
+
+	vw.wg.Wait()
+
+	vw.notifyListeners(VoiceWakeEvent{
+		Type:      VoiceWakeEventStateChanged,
+		State:     VoiceWakeStateIdle,
+		Timestamp: time.Now(),
+		Data:      map[string]any{"message": "Voice wake listener stopped"},
+	})
+
+	return nil
+}
+
+func (vw *VoiceWake) listenLoop() {
+	defer vw.wg.Done()
+
+	samples := make([]int16, vw.cfg.FrameSize)
+
+	for {
+		select {
+		case <-vw.ctx.Done():
+			return
+		default:
+		}
+
+		var n int
+		var err error
+
+		if vw.cfg.AudioSource != nil {
+			n, err = vw.cfg.AudioSource.Read(samples)
+			if err != nil {
+				log.Printf("voicewake: error reading audio: %v", err)
+				time.Sleep(10 * time.Millisecond)
+				continue
+			}
+		} else {
+			time.Sleep(time.Duration(vw.cfg.FrameSize) * time.Second / time.Duration(vw.cfg.SampleRate))
+			continue
+		}
+
+		if n == 0 {
+			continue
+		}
+
+		vw.mu.Lock()
+		inCooldown := time.Now().Before(vw.cooldownUntil)
+		vw.mu.Unlock()
+
+		if inCooldown {
+			continue
+		}
+
+		if vw.engineAdapter != nil && vw.engineAdapter.UseEngine() {
+			result, detected := vw.engineAdapter.ProcessFrame(samples[:n])
+			if detected && result != nil {
+				vw.mu.Lock()
+				vw.lastWakeMatch = result.Keyword
+				vw.lastConfidence = result.Confidence
+				vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
+				vw.mu.Unlock()
+
+				vw.notifyListeners(VoiceWakeEvent{
+					Type:      VoiceWakeEventWakeDetected,
+					State:     VoiceWakeStateTriggered,
+					Timestamp: time.Now(),
+					Data: map[string]any{
+						"phrase":     result.Keyword,
+						"confidence": result.Confidence,
+						"engine":     string(result.Engine),
+						"energy":     0.0,
+					},
+				})
+
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateTriggered)
+				vw.mu.Unlock()
+
+				time.Sleep(vw.cfg.CooldownTime)
+
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateListening)
+				vw.mu.Unlock()
+
+				continue
+			}
+		}
+
+		vw.processAudio(samples[:n])
+	}
+}
+
+func (vw *VoiceWake) processAudio(samples []int16) {
+	vw.mu.Lock()
+	vw.audioBuffer = append(vw.audioBuffer, samples...)
+	vw.mu.Unlock()
+
+	state := vw.vad.ProcessFrame(samples)
+
+	switch state {
+	case VADStateSpeech:
+		vw.mu.Lock()
+		if !vw.isRecording {
+			vw.isRecording = true
+			vw.recordingStart = time.Now()
+			vw.recordingBuffer = make([]int16, 0, vw.cfg.SampleRate*int(vw.cfg.MaxRecordingTime.Seconds()))
+			vw.setState(VoiceWakeStateRecording)
+		}
+		vw.recordingBuffer = append(vw.recordingBuffer, samples...)
+		vw.mu.Unlock()
+
+	case VADStateSilence:
+		vw.mu.Lock()
+		if vw.isRecording {
+			vw.isRecording = false
+			recording := make([]int16, len(vw.recordingBuffer))
+			copy(recording, vw.recordingBuffer)
+			vw.recordingBuffer = nil
+			vw.mu.Unlock()
+
+			vw.processRecording(recording)
+		} else {
+			vw.mu.Unlock()
+		}
+	}
+}
+
+func (vw *VoiceWake) processRecording(samples []int16) {
+	if len(samples) == 0 {
+		return
+	}
+
+	vw.mu.Lock()
+	vw.setState(VoiceWakeStateProcessing)
+	vw.mu.Unlock()
+
+	if vw.cfg.AutoTranscribe && vw.transcriber != nil {
+		audioData := Int16ToWAV(samples, vw.cfg.SampleRate, vw.cfg.Channels)
+
+		go func() {
+			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+			defer cancel()
+
+			result, err := vw.transcriber.TranscribeDirect(ctx, audioData, WithSTTInputFormat(InputWAV))
+			if err != nil {
+				log.Printf("voicewake: transcription error: %v", err)
+				vw.notifyListeners(VoiceWakeEvent{
+					Type:      VoiceWakeEventError,
+					State:     VoiceWakeStateProcessing,
+					Timestamp: time.Now(),
+					Data:      map[string]any{"error": err.Error()},
+				})
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateListening)
+				vw.mu.Unlock()
+				return
+			}
+
+			vw.mu.Lock()
+			vw.lastTranscript = result.Text
+			vw.mu.Unlock()
+
+			vw.checkWakeWord(result.Text)
+		}()
+	} else {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	}
+}
+
+func (vw *VoiceWake) checkWakeWord(transcript string) {
+	if transcript == "" {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+		return
+	}
+
+	phrase, confidence, matched := vw.wakeDetector.Detect(transcript)
+
+	vw.mu.Lock()
+	vw.lastTranscript = transcript
+	vw.lastWakeMatch = phrase
+	vw.lastConfidence = confidence
+	vw.mu.Unlock()
+
+	if matched {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateTriggered)
+		vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
+		energy := vw.lastEnergy
+		vw.mu.Unlock()
+
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventWakeDetected,
+			State:     VoiceWakeStateTriggered,
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"phrase":     phrase,
+				"confidence": confidence,
+				"transcript": transcript,
+				"energy":     energy,
+			},
+		})
+
+		time.Sleep(vw.cfg.CooldownTime)
+
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	} else {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	}
+}
+
+func (vw *VoiceWake) onVADStateChanged(state VADState, energy float64, zcr float64) {
+	vw.mu.Lock()
+	vw.lastEnergy = energy
+	vw.mu.Unlock()
+
+	switch state {
+	case VADStateSpeech:
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventSpeechStart,
+			State:     vw.State(),
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"energy": energy,
+				"zcr":    zcr,
+			},
+		})
+
+	case VADStateSilence:
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventSpeechEnd,
+			State:     vw.State(),
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"energy": energy,
+				"zcr":    zcr,
+			},
+		})
+	}
+}
+
+func (vw *VoiceWake) setState(state VoiceWakeState) {
+	oldState := vw.state
+	vw.state = state
+
+	if oldState != state {
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventStateChanged,
+			State:     state,
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"previous_state": oldState,
+				"new_state":      state,
+			},
+		})
+	}
+}
+
+func (vw *VoiceWake) State() VoiceWakeState {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.state
+}
+
+func (vw *VoiceWake) notifyListeners(event VoiceWakeEvent) {
+	vw.mu.Lock()
+	listeners := make([]VoiceWakeListener, len(vw.listeners))
+	copy(listeners, vw.listeners)
+	vw.mu.Unlock()
+
+	for _, listener := range listeners {
+		listener(event)
+	}
+}
+
+func (vw *VoiceWake) LastTranscript() string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.lastTranscript
+}
+
+func (vw *VoiceWake) LastWakeMatch() (string, float64) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.lastWakeMatch, vw.lastConfidence
+}
+
+func (vw *VoiceWake) VAD() *VAD {
+	return vw.vad
+}
+
+func (vw *VoiceWake) WakeDetector() *WakeWordDetector {
+	return vw.wakeDetector
+}
+
+func (vw *VoiceWake) UpdateConfig(cfg VoiceWakeConfig) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+
+	if cfg.SampleRate > 0 {
+		vw.cfg.SampleRate = cfg.SampleRate
+	}
+	if cfg.Channels > 0 {
+		vw.cfg.Channels = cfg.Channels
+	}
+	if cfg.FrameSize > 0 {
+		vw.cfg.FrameSize = cfg.FrameSize
+	}
+	if cfg.MaxRecordingTime > 0 {
+		vw.cfg.MaxRecordingTime = cfg.MaxRecordingTime
+	}
+	if cfg.CooldownTime > 0 {
+		vw.cfg.CooldownTime = cfg.CooldownTime
+	}
+
+	vw.cfg.AutoTranscribe = cfg.AutoTranscribe
+}
+
+func (vw *VoiceWake) Config() VoiceWakeConfig {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.cfg
+}
+
+func (vw *VoiceWake) SetTranscriber(pipeline *STTPipeline) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	vw.transcriber = pipeline
+}
+
+func (vw *VoiceWake) RegisterEngine(engineType WakeWordEngineType, cfg WakeWordEngineConfig) error {
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router == nil {
+		return fmt.Errorf("voicewake: no engine router available")
+	}
+
+	if err := router.CreateEngine(engineType, cfg); err != nil {
+		return err
+	}
+
+	vw.engineAdapter.SetUseEngine(true)
+	return nil
+}
+
+func (vw *VoiceWake) SetActiveEngine(name string) error {
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router == nil {
+		return fmt.Errorf("voicewake: no engine router available")
+	}
+
+	return router.SetActive(name)
+}
+
+func (vw *VoiceWake) UseWakeWordEngine(use bool) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineAdapter != nil {
+		vw.engineAdapter.SetUseEngine(use)
+	}
+}
+
+func (vw *VoiceWake) IsUsingWakeWordEngine() bool {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineAdapter == nil {
+		return false
+	}
+	return vw.engineAdapter.UseEngine()
+}
+
+func (vw *VoiceWake) AvailableEngines() []string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineRouter == nil {
+		return nil
+	}
+	return vw.engineRouter.Engines()
+}
+
+func (vw *VoiceWake) ActiveEngine() string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineRouter == nil {
+		return ""
+	}
+	return vw.engineRouter.ActiveEngine()
+}
+
+func (vw *VoiceWake) EngineRouter() *WakeWordEngineRouter {
+	return vw.engineRouter
+}
+
+func (vw *VoiceWake) EngineAdapter() *WakeWordEngineAdapter {
+	return vw.engineAdapter
+}
+
+func (vw *VoiceWake) Close() error {
+	if err := vw.Stop(); err != nil {
+		return err
+	}
+
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router != nil {
+		return router.Close()
+	}
+	return nil
+}
diff --git a/pkg/speech/stt_openai_client.go b/pkg/speech/stt_openai_client.go
new file mode 100644
index 00000000..3d986daa
--- /dev/null
+++ b/pkg/speech/stt_openai_client.go
@@ -0,0 +1,126 @@
+package speech
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"mime/multipart"
+	"net/http"
+)
+
+type openAIAudioAPIClient struct {
+	apiKey  string
+	baseURL string
+	client  *http.Client
+}
+
+func newOpenAIAudioAPIClient(apiKey, baseURL string, client *http.Client) *openAIAudioAPIClient {
+	return &openAIAudioAPIClient{
+		apiKey:  apiKey,
+		baseURL: baseURL,
+		client:  client,
+	}
+}
+
+func (c *openAIAudioAPIClient) DoTranscriptionRequest(ctx context.Context, endpoint string, audio []byte, options TranscribeOptions, stream bool) (*http.Response, error) {
+	body, contentType, err := c.buildMultipartBody(audio, options, stream)
+	if err != nil {
+		return nil, err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, "POST", c.baseURL+endpoint, body)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
+	}
+
+	req.Header.Set("Authorization", "Bearer "+c.apiKey)
+	req.Header.Set("Content-Type", contentType)
+	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
+	if stream {
+		req.Header.Set("Accept", "text/event-stream")
+	}
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: request failed: %v", err)
+	}
+
+	return resp, nil
+}
+
+func (c *openAIAudioAPIClient) buildMultipartBody(audio []byte, options TranscribeOptions, stream bool) (*bytes.Buffer, string, error) {
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+
+	filename := "audio." + string(options.InputFormat)
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
+	}
+
+	if _, err := part.Write(audio); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
+	}
+
+	if err := writer.WriteField("model", options.Model); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
+	}
+
+	if options.Language != "" {
+		if err := writer.WriteField("language", options.Language); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
+		}
+	}
+
+	if options.Prompt != "" {
+		if err := writer.WriteField("prompt", options.Prompt); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write prompt field: %v", err)
+		}
+	}
+
+	if options.Temperature > 0 {
+		if err := writer.WriteField("temperature", fmt.Sprintf("%.2f", options.Temperature)); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write temperature field: %v", err)
+		}
+	}
+
+	if options.MaxAlternatives > 0 {
+		if err := writer.WriteField("max_alternatives", fmt.Sprintf("%d", options.MaxAlternatives)); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write max_alternatives field: %v", err)
+		}
+	}
+
+	if options.WordTimestamps || options.SpeakerLabels {
+		if options.WordTimestamps {
+			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
+				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
+			}
+		}
+		if options.SpeakerLabels {
+			if err := writer.WriteField("timestamp_granularities[]", "segment"); err != nil {
+				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write segment timestamp_granularities: %v", err)
+			}
+		}
+	}
+
+	responseType := "verbose_json"
+	if stream {
+		responseType = "json"
+	}
+	if err := writer.WriteField("response_format", responseType); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
+	}
+
+	if stream {
+		if err := writer.WriteField("stream", "true"); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write stream field: %v", err)
+		}
+	}
+
+	if err := writer.Close(); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
+	}
+
+	return &body, writer.FormDataContentType(), nil
+}
+
diff --git a/pkg/speech/stt_whisper.go b/pkg/speech/stt_whisper.go
index 9ac04a86..e94b451d 100644
--- a/pkg/speech/stt_whisper.go
+++ b/pkg/speech/stt_whisper.go
@@ -2,12 +2,10 @@ package speech
 
 import (
 	"bufio"
-	"bytes"
 	"context"
 	"encoding/json"
 	"fmt"
 	"io"
-	"mime/multipart"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -45,6 +43,7 @@ type WhisperProvider struct {
 	timeout       time.Duration
 	retries       int
 	client        *http.Client
+	apiClient     *openAIAudioAPIClient
 	httpTransport *http.Transport
 }
 
@@ -108,6 +107,7 @@ func NewWhisperProvider(apiKey string, opts ...WhisperOption) (*WhisperProvider,
 		p.client.Transport = p.httpTransport
 	}
 	p.client.Timeout = p.timeout
+	p.apiClient = newOpenAIAudioAPIClient(p.apiKey, p.baseURL, p.client)
 
 	if !validWhisperModels[p.model] {
 		return nil, NewSTTErrorf(ErrProviderNotSupported, "openai: invalid whisper model: %s", p.model)
@@ -256,72 +256,6 @@ func (p *WhisperProvider) validateTranscribeOptions(options TranscribeOptions) e
 }
 
 func (p *WhisperProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
-	var body bytes.Buffer
-	writer := multipart.NewWriter(&body)
-
-	filename := "audio." + string(options.InputFormat)
-	part, err := writer.CreateFormFile("file", filename)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
-	}
-
-	if _, err := part.Write(audio); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
-	}
-
-	if err := writer.WriteField("model", options.Model); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
-	}
-
-	if options.Language != "" {
-		if err := writer.WriteField("language", options.Language); err != nil {
-			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
-		}
-	}
-
-	if options.Prompt != "" {
-		if err := writer.WriteField("prompt", options.Prompt); err != nil {
-			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write prompt field: %v", err)
-		}
-	}
-
-	if options.Temperature > 0 {
-		if err := writer.WriteField("temperature", fmt.Sprintf("%.2f", options.Temperature)); err != nil {
-			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write temperature field: %v", err)
-		}
-	}
-
-	if options.MaxAlternatives > 0 {
-		if err := writer.WriteField("max_alternatives", fmt.Sprintf("%d", options.MaxAlternatives)); err != nil {
-			return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write max_alternatives field: %v", err)
-		}
-	}
-
-	if options.WordTimestamps || options.SpeakerLabels {
-		if options.WordTimestamps {
-			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
-				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
-			}
-		}
-		if options.SpeakerLabels {
-			if err := writer.WriteField("timestamp_granularities[]", "segment"); err != nil {
-				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write segment timestamp_granularities: %v", err)
-			}
-		}
-	}
-
-	responseType := "verbose_json"
-	if options.WordTimestamps || options.SpeakerLabels {
-		responseType = "verbose_json"
-	}
-	if err := writer.WriteField("response_format", responseType); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
-	}
-
-	if err := writer.Close(); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
-	}
-
 	var endpoint string
 	switch options.Mode {
 	case ModeTranslation:
@@ -330,20 +264,9 @@ func (p *WhisperProvider) doTranscribe(ctx context.Context, audio []byte, option
 		endpoint = "/v1/audio/transcriptions"
 	}
 
-	url := p.baseURL + endpoint
-
-	req, err := http.NewRequestWithContext(ctx, "POST", url, &body)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
-	}
-
-	req.Header.Set("Authorization", "Bearer "+p.apiKey)
-	req.Header.Set("Content-Type", writer.FormDataContentType())
-	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
-
-	resp, err := p.client.Do(req)
+	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, false)
 	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: request failed: %v", err)
+		return nil, err
 	}
 	defer resp.Body.Close()
 
@@ -520,60 +443,14 @@ func (p *WhisperProvider) TranscribeSSE(ctx context.Context, audio []byte, onChu
 		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
 	}
 
-	var body bytes.Buffer
-	writer := multipart.NewWriter(&body)
-
-	filename := "audio." + string(options.InputFormat)
-	part, err := writer.CreateFormFile("file", filename)
-	if err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
-	}
-
-	if _, err := part.Write(audio); err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
-	}
-
-	if err := writer.WriteField("model", options.Model); err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
-	}
-
-	if options.Language != "" {
-		if err := writer.WriteField("language", options.Language); err != nil {
-			return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
-		}
-	}
-
-	if err := writer.WriteField("response_format", "json"); err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
-	}
-
-	if err := writer.WriteField("stream", "true"); err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write stream field: %v", err)
-	}
-
-	if err := writer.Close(); err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
-	}
-
 	endpoint := "/v1/audio/transcriptions"
 	if options.Mode == ModeTranslation {
 		endpoint = "/v1/audio/translations"
 	}
 
-	url := p.baseURL + endpoint
-
-	req, err := http.NewRequestWithContext(ctx, "POST", url, &body)
+	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, true)
 	if err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
-	}
-
-	req.Header.Set("Authorization", "Bearer "+p.apiKey)
-	req.Header.Set("Content-Type", writer.FormDataContentType())
-	req.Header.Set("Accept", "text/event-stream")
-
-	resp, err := p.client.Do(req)
-	if err != nil {
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: streaming request failed: %v", err)
+		return err
 	}
 	defer resp.Body.Close()
 
diff --git a/pkg/speech/vad.go b/pkg/speech/vad.go
index fc36af30..89ca1fe2 100644
--- a/pkg/speech/vad.go
+++ b/pkg/speech/vad.go
@@ -45,6 +45,14 @@ type VAD struct {
 
 type VADStateListener func(state VADState, energy float64, zcr float64)
 
+func (v *VAD) Name() string {
+	return "heuristic-vad"
+}
+
+func (v *VAD) Type() VADProviderType {
+	return VADProviderHeuristic
+}
+
 func NewVAD(cfg VADConfig) *VAD {
 	if cfg.SampleRate == 0 {
 		cfg.SampleRate = 16000
diff --git a/pkg/speech/vad_provider.go b/pkg/speech/vad_provider.go
new file mode 100644
index 00000000..ace4c278
--- /dev/null
+++ b/pkg/speech/vad_provider.go
@@ -0,0 +1,55 @@
+package speech
+
+import "fmt"
+
+type VADProviderType string
+
+const (
+	VADProviderHeuristic VADProviderType = "heuristic"
+)
+
+type VADProcessor interface {
+	Name() string
+	Type() VADProviderType
+	ProcessFrame(samples []int16) VADState
+	ProcessFloatFrame(samples []float32) VADState
+	RegisterListener(listener VADStateListener)
+	State() VADState
+	Reset()
+	UpdateConfig(cfg VADConfig)
+	Config() VADConfig
+}
+
+type VADProviderFactory func(cfg VADConfig) (VADProcessor, error)
+
+type VADManager struct {
+	factories map[VADProviderType]VADProviderFactory
+}
+
+func NewVADManager() *VADManager {
+	m := &VADManager{
+		factories: map[VADProviderType]VADProviderFactory{},
+	}
+	m.Register(VADProviderHeuristic, func(cfg VADConfig) (VADProcessor, error) {
+		return NewVAD(cfg), nil
+	})
+	return m
+}
+
+func (m *VADManager) Register(providerType VADProviderType, factory VADProviderFactory) {
+	m.factories[providerType] = factory
+}
+
+func (m *VADManager) New(cfg VADConfig, providerType VADProviderType) (VADProcessor, error) {
+	if providerType == "" {
+		providerType = VADProviderHeuristic
+	}
+
+	factory, ok := m.factories[providerType]
+	if !ok {
+		return nil, fmt.Errorf("vad: unsupported provider %q", providerType)
+	}
+
+	return factory(cfg)
+}
+
diff --git a/pkg/speech/voicewake.go b/pkg/speech/voicewake.go
index 5807bbf9..044ffa62 100644
--- a/pkg/speech/voicewake.go
+++ b/pkg/speech/voicewake.go
@@ -47,6 +47,7 @@ type AudioSource interface {
 
 type VoiceWakeConfig struct {
 	VADConfig        VADConfig
+	VADProvider      VADProviderType
 	WakeWordConfig   WakeWordConfig
 	EngineConfig     WakeWordEngineConfig
 	SampleRate       int
@@ -63,6 +64,7 @@ type VoiceWakeConfig struct {
 func DefaultVoiceWakeConfig() VoiceWakeConfig {
 	return VoiceWakeConfig{
 		VADConfig:        DefaultVADConfig(),
+		VADProvider:      VADProviderHeuristic,
 		WakeWordConfig:   DefaultWakeWordConfig(),
 		SampleRate:       16000,
 		Channels:         1,
@@ -77,7 +79,7 @@ type VoiceWake struct {
 	mu              sync.Mutex
 	cfg             VoiceWakeConfig
 	state           VoiceWakeState
-	vad             *VAD
+	vad             VADProcessor
 	wakeDetector    *WakeWordDetector
 	engineRouter    *WakeWordEngineRouter
 	engineAdapter   *WakeWordEngineAdapter
@@ -120,7 +122,16 @@ func NewVoiceWake(cfg VoiceWakeConfig) *VoiceWake {
 	cfg.EngineConfig.SampleRate = cfg.SampleRate
 	cfg.EngineConfig.FrameSize = cfg.FrameSize
 
-	vad := NewVAD(cfg.VADConfig)
+	if cfg.VADProvider == "" {
+		cfg.VADProvider = VADProviderHeuristic
+	}
+
+	vadManager := NewVADManager()
+	vad, err := vadManager.New(cfg.VADConfig, cfg.VADProvider)
+	if err != nil {
+		log.Printf("voicewake: failed to create VAD provider %q, fallback to heuristic: %v", cfg.VADProvider, err)
+		vad = NewVAD(cfg.VADConfig)
+	}
 	wakeDetector := NewWakeWordDetector(cfg.WakeWordConfig)
 
 	router := NewWakeWordEngineRouter(cfg.EngineConfig)
@@ -489,7 +500,7 @@ func (vw *VoiceWake) LastWakeMatch() (string, float64) {
 	return vw.lastWakeMatch, vw.lastConfidence
 }
 
-func (vw *VoiceWake) VAD() *VAD {
+func (vw *VoiceWake) VAD() VADProcessor {
 	return vw.vad
 }
 

From 763a012646b47cae35e2f79e840fb3131791d8f3 Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Tue, 28 Apr 2026 17:02:34 +0800
Subject: [PATCH 2/6] feat(speech): switch voice wake default VAD to webrtc

---
 go.mod                     |   4 +-
 go.sum                     |   2 +
 pkg/speech/vad.go          |   5 ++
 pkg/speech/vad_provider.go |   5 +-
 pkg/speech/vad_webrtc.go   | 144 +++++++++++++++++++++++++++++++++++++
 pkg/speech/voicewake.go    |   2 +-
 6 files changed, 159 insertions(+), 3 deletions(-)
 create mode 100644 pkg/speech/vad_webrtc.go

diff --git a/go.mod b/go.mod
index e53016d0..af8b594e 100644
--- a/go.mod
+++ b/go.mod
@@ -1,8 +1,9 @@
 module github.com/1024XEngineer/anyclaw
 
-go 1.25.0
+go 1.25.1
 
 require (
+	github.com/anyclaw/anyclaw v0.0.0
 	github.com/charmbracelet/bubbles v1.0.0
 	github.com/charmbracelet/bubbletea v1.3.10
 	github.com/charmbracelet/lipgloss v1.1.0
@@ -30,6 +31,7 @@ require (
 	github.com/gobwas/httphead v0.1.0 // indirect
 	github.com/gobwas/pool v0.2.1 // indirect
 	github.com/gobwas/ws v1.4.0 // indirect
+	github.com/godeps/webrtcvad-go v0.1.0 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
diff --git a/go.sum b/go.sum
index 336affad..ce47680e 100644
--- a/go.sum
+++ b/go.sum
@@ -36,6 +36,8 @@ github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
 github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
 github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
 github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
+github.com/godeps/webrtcvad-go v0.1.0 h1:JpVfJHSzND9p/iuO7xqko1UlB/UJjKxskEWEbzKKjrQ=
+github.com/godeps/webrtcvad-go v0.1.0/go.mod h1:487THSHEZrYU29LRm4AKYCm/Y8PPq3pIJSuz1KX3MwU=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
diff --git a/pkg/speech/vad.go b/pkg/speech/vad.go
index 89ca1fe2..045f0f9f 100644
--- a/pkg/speech/vad.go
+++ b/pkg/speech/vad.go
@@ -15,6 +15,7 @@ const (
 type VADConfig struct {
 	SampleRate         int
 	FrameSize          int
+	Aggressiveness     int
 	EnergyThreshold    float64
 	ZeroCrossThreshold int
 	SpeechMinFrames    int
@@ -26,6 +27,7 @@ func DefaultVADConfig() VADConfig {
 	return VADConfig{
 		SampleRate:         16000,
 		FrameSize:          320,
+		Aggressiveness:     2,
 		EnergyThreshold:    0.01,
 		ZeroCrossThreshold: 50,
 		SpeechMinFrames:    3,
@@ -60,6 +62,9 @@ func NewVAD(cfg VADConfig) *VAD {
 	if cfg.FrameSize == 0 {
 		cfg.FrameSize = 320
 	}
+	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
+		cfg.Aggressiveness = 2
+	}
 	if cfg.EnergyThreshold == 0 {
 		cfg.EnergyThreshold = 0.01
 	}
diff --git a/pkg/speech/vad_provider.go b/pkg/speech/vad_provider.go
index ace4c278..ce0de4e2 100644
--- a/pkg/speech/vad_provider.go
+++ b/pkg/speech/vad_provider.go
@@ -6,6 +6,7 @@ type VADProviderType string
 
 const (
 	VADProviderHeuristic VADProviderType = "heuristic"
+	VADProviderWebRTC    VADProviderType = "webrtc"
 )
 
 type VADProcessor interface {
@@ -33,6 +34,9 @@ func NewVADManager() *VADManager {
 	m.Register(VADProviderHeuristic, func(cfg VADConfig) (VADProcessor, error) {
 		return NewVAD(cfg), nil
 	})
+	m.Register(VADProviderWebRTC, func(cfg VADConfig) (VADProcessor, error) {
+		return NewWebRTCVAD(cfg)
+	})
 	return m
 }
 
@@ -52,4 +56,3 @@ func (m *VADManager) New(cfg VADConfig, providerType VADProviderType) (VADProces
 
 	return factory(cfg)
 }
-
diff --git a/pkg/speech/vad_webrtc.go b/pkg/speech/vad_webrtc.go
new file mode 100644
index 00000000..4a7d2163
--- /dev/null
+++ b/pkg/speech/vad_webrtc.go
@@ -0,0 +1,144 @@
+package speech
+
+import (
+	"encoding/binary"
+	"fmt"
+
+	webrtcvad "github.com/godeps/webrtcvad-go"
+)
+
+type WebRTCVAD struct {
+	inner      *VAD
+	detector   *webrtcvad.VAD
+	mode       int
+	sampleRate int
+	frameSize  int
+}
+
+func NewWebRTCVAD(cfg VADConfig) (*WebRTCVAD, error) {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
+		cfg.Aggressiveness = 2
+	}
+
+	if !webrtcvad.ValidRateAndFrameLength(cfg.SampleRate, cfg.FrameSize) {
+		return nil, fmt.Errorf("vad: invalid WebRTC sampleRate/frameSize combination: %d/%d", cfg.SampleRate, cfg.FrameSize)
+	}
+
+	detector, err := webrtcvad.New(cfg.Aggressiveness)
+	if err != nil {
+		return nil, fmt.Errorf("vad: failed to create WebRTC VAD: %w", err)
+	}
+
+	return &WebRTCVAD{
+		inner:      NewVAD(cfg),
+		detector:   detector,
+		mode:       cfg.Aggressiveness,
+		sampleRate: cfg.SampleRate,
+		frameSize:  cfg.FrameSize,
+	}, nil
+}
+
+func (v *WebRTCVAD) Name() string {
+	return "webrtc-vad"
+}
+
+func (v *WebRTCVAD) Type() VADProviderType {
+	return VADProviderWebRTC
+}
+
+func (v *WebRTCVAD) ProcessFrame(samples []int16) VADState {
+	if len(samples) == 0 {
+		return v.inner.ProcessFrame(samples)
+	}
+
+	audio := int16ToLittleEndianBytes(samples)
+	isSpeech, err := v.detector.IsSpeech(audio, v.sampleRate)
+	if err != nil {
+		return v.inner.ProcessFrame(samples)
+	}
+
+	v.inner.mu.Lock()
+	defer v.inner.mu.Unlock()
+
+	energy := v.inner.calculateRMS(samples)
+	zcr := v.inner.calculateZeroCrossingRate(samples)
+
+	if isSpeech {
+		v.inner.consecutiveSpeech++
+		v.inner.consecutiveSilence = 0
+	} else {
+		v.inner.consecutiveSilence++
+		v.inner.consecutiveSpeech = 0
+	}
+
+	switch v.inner.state {
+	case VADStateSilence:
+		if isSpeech {
+			if v.inner.consecutiveSpeech >= v.inner.cfg.SpeechMinFrames {
+				v.inner.state = VADStateSpeech
+				v.inner.notifyListeners(VADStateSpeech, energy, zcr)
+			}
+		} else {
+			v.inner.consecutiveSpeech = 0
+		}
+
+	case VADStateSpeech:
+		if isSpeech {
+			v.inner.consecutiveSilence = 0
+		} else {
+			if v.inner.consecutiveSilence >= v.inner.cfg.HangoverFrames {
+				v.inner.state = VADStateSilence
+				v.inner.consecutiveSpeech = 0
+				v.inner.consecutiveSilence = 0
+				v.inner.notifyListeners(VADStateSilence, energy, zcr)
+			}
+		}
+	}
+
+	return v.inner.state
+}
+
+func (v *WebRTCVAD) ProcessFloatFrame(samples []float32) VADState {
+	return v.ProcessFrame(Float32ToInt16(samples))
+}
+
+func (v *WebRTCVAD) RegisterListener(listener VADStateListener) {
+	v.inner.RegisterListener(listener)
+}
+
+func (v *WebRTCVAD) State() VADState {
+	return v.inner.State()
+}
+
+func (v *WebRTCVAD) Reset() {
+	v.inner.Reset()
+}
+
+func (v *WebRTCVAD) UpdateConfig(cfg VADConfig) {
+	v.inner.UpdateConfig(cfg)
+	if cfg.Aggressiveness >= 0 && cfg.Aggressiveness <= 3 {
+		_ = v.detector.SetMode(cfg.Aggressiveness)
+		v.mode = cfg.Aggressiveness
+	}
+}
+
+func (v *WebRTCVAD) Config() VADConfig {
+	cfg := v.inner.Config()
+	cfg.Aggressiveness = v.mode
+	return cfg
+}
+
+func int16ToLittleEndianBytes(samples []int16) []byte {
+	out := make([]byte, len(samples)*2)
+	for i, s := range samples {
+		binary.LittleEndian.PutUint16(out[i*2:], uint16(s))
+	}
+	return out
+}
+
diff --git a/pkg/speech/voicewake.go b/pkg/speech/voicewake.go
index 044ffa62..64d89ffc 100644
--- a/pkg/speech/voicewake.go
+++ b/pkg/speech/voicewake.go
@@ -64,7 +64,7 @@ type VoiceWakeConfig struct {
 func DefaultVoiceWakeConfig() VoiceWakeConfig {
 	return VoiceWakeConfig{
 		VADConfig:        DefaultVADConfig(),
-		VADProvider:      VADProviderHeuristic,
+		VADProvider:      VADProviderWebRTC,
 		WakeWordConfig:   DefaultWakeWordConfig(),
 		SampleRate:       16000,
 		Channels:         1,

From 7595c995946ae435cfeea2604c2f2e9d1f86de64 Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Tue, 28 Apr 2026 17:12:47 +0800
Subject: [PATCH 3/6] refactor(speech): switch google stt to official client

---
 go.mod                          |  34 +-
 go.sum                          |  94 ++++-
 pkg/speech/stt_google.go        | 329 ++++++++--------
 pkg/speech/stt_google_client.go |  34 ++
 pkg/speech/stt_google_test.go   | 661 +++++++++++++-------------------
 pkg/speech/stt_provider.go      |  18 +-
 6 files changed, 581 insertions(+), 589 deletions(-)
 create mode 100644 pkg/speech/stt_google_client.go

diff --git a/go.mod b/go.mod
index af8b594e..0e916bca 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/1024XEngineer/anyclaw
 go 1.25.1
 
 require (
-	github.com/anyclaw/anyclaw v0.0.0
+	cloud.google.com/go/speech v1.33.0
 	github.com/charmbracelet/bubbles v1.0.0
 	github.com/charmbracelet/bubbletea v1.3.10
 	github.com/charmbracelet/lipgloss v1.1.0
@@ -11,15 +11,25 @@ require (
 	github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
 	github.com/chromedp/chromedp v0.10.0
 	github.com/clipperhouse/uax29/v2 v2.5.0
+	github.com/godeps/webrtcvad-go v0.1.0
 	github.com/gorilla/websocket v1.5.3
 	github.com/philippgille/chromem-go v0.7.0
 	golang.org/x/sys v0.42.0
-	golang.org/x/text v0.22.0
+	golang.org/x/text v0.35.0
+	google.golang.org/api v0.275.0
+	google.golang.org/grpc v1.80.0
+	google.golang.org/protobuf v1.36.11
 	modernc.org/sqlite v1.48.1
 )
 
 require (
+	cloud.google.com/go v0.123.0 // indirect
+	cloud.google.com/go/auth v0.20.0 // indirect
+	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
+	cloud.google.com/go/compute/metadata v0.9.0 // indirect
+	cloud.google.com/go/longrunning v0.9.0 // indirect
 	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
 	github.com/charmbracelet/colorprofile v0.4.1 // indirect
 	github.com/charmbracelet/x/ansi v0.11.6 // indirect
 	github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
@@ -28,11 +38,16 @@ require (
 	github.com/clipperhouse/stringish v0.1.1 // indirect
 	github.com/dustin/go-humanize v1.0.1 // indirect
 	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/gobwas/httphead v0.1.0 // indirect
 	github.com/gobwas/pool v0.2.1 // indirect
 	github.com/gobwas/ws v1.4.0 // indirect
-	github.com/godeps/webrtcvad-go v0.1.0 // indirect
+	github.com/google/s2a-go v0.1.9 // indirect
 	github.com/google/uuid v1.6.0 // indirect
+	github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect
+	github.com/googleapis/gax-go/v2 v2.21.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
 	github.com/mailru/easyjson v0.7.7 // indirect
@@ -46,6 +61,19 @@ require (
 	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
 	github.com/rivo/uniseg v0.4.7 // indirect
 	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect
+	go.opentelemetry.io/otel v1.43.0 // indirect
+	go.opentelemetry.io/otel/metric v1.43.0 // indirect
+	go.opentelemetry.io/otel/trace v1.43.0 // indirect
+	golang.org/x/crypto v0.49.0 // indirect
+	golang.org/x/net v0.52.0 // indirect
+	golang.org/x/oauth2 v0.36.0 // indirect
+	golang.org/x/sync v0.20.0 // indirect
+	golang.org/x/time v0.15.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
 	modernc.org/libc v1.70.0 // indirect
 	modernc.org/mathutil v1.7.1 // indirect
 	modernc.org/memory v1.11.0 // indirect
diff --git a/go.sum b/go.sum
index ce47680e..bd4ff23b 100644
--- a/go.sum
+++ b/go.sum
@@ -1,5 +1,19 @@
+cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE=
+cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU=
+cloud.google.com/go/auth v0.20.0 h1:kXTssoVb4azsVDoUiF8KvxAqrsQcQtB53DcSgta74CA=
+cloud.google.com/go/auth v0.20.0/go.mod h1:942/yi/itH1SsmpyrbnTMDgGfdy2BUqIKyd0cyYLc5Q=
+cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
+cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
+cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
+cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
+cloud.google.com/go/longrunning v0.9.0 h1:0EzbDEGsAvOZNbqXopgniY0w0a1phvu5IdUFq8grmqY=
+cloud.google.com/go/longrunning v0.9.0/go.mod h1:pkTz846W7bF4o2SzdWJ40Hu0Re+UoNT6Q5t+igIcb8E=
+cloud.google.com/go/speech v1.33.0 h1:555yroj4HCS7SPgfHuDU8zX+E5KrhccVWG96HNyBUAk=
+cloud.google.com/go/speech v1.33.0/go.mod h1:shnf33sZbGnQQZyek1fdLOR5rRKV6D3jsNqpqyijvj8=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
 github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
 github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
 github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
 github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
@@ -26,10 +40,26 @@ github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfa
 github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA=
 github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
 github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
+github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w=
+github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
 github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g=
+github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98=
+github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4=
+github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA=
 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
 github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
+github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
+github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
 github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
 github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
 github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
@@ -38,10 +68,20 @@ github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
 github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
 github.com/godeps/webrtcvad-go v0.1.0 h1:JpVfJHSzND9p/iuO7xqko1UlB/UJjKxskEWEbzKKjrQ=
 github.com/godeps/webrtcvad-go v0.1.0/go.mod h1:487THSHEZrYU29LRm4AKYCm/Y8PPq3pIJSuz1KX3MwU=
+github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
+github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
+github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
+github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
 github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
+github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
+github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
 github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
 github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg=
+github.com/googleapis/gax-go/v2 v2.21.0 h1:h45NjjzEO3faG9Lg/cFrBh2PgegVVgzqKzuZl/wMbiI=
+github.com/googleapis/gax-go/v2 v2.21.0/go.mod h1:But/NJU6TnZsrLai/xBAQLLz+Hc7fHZJt/hsCz3Fih4=
 github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
 github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
 github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
@@ -72,27 +112,73 @@ github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhA
 github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
 github.com/philippgille/chromem-go v0.7.0 h1:4jfvfyKymjKNfGxBUhHUcj1kp7B17NL/I1P+vGh1RvY=
 github.com/philippgille/chromem-go v0.7.0/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo=
+github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
 github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
 github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
+github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
 github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
+go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 h1:yI1/OhfEPy7J9eoa6Sj051C7n5dvpj0QX8g4sRchg04=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0/go.mod h1:NoUCKYWK+3ecatC4HjkRktREheMeEtrXoQxrqYFeHSc=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 h1:OyrsyzuttWTSur2qN/Lm0m2a8yqyIjUVBZcxFPuXq2o=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0/go.mod h1:C2NGBr+kAB4bk3xtMXfZ94gqFDtg/GkI7e9zqGh5Beg=
+go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
+go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
+go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
+go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
+go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
+go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
+go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA=
+go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc=
+go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
+go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
+golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
+golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
 golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY=
 golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70=
 golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
 golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
+golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
+golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs=
+golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
 golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
 golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
-golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
+golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
+golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
 golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
 golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
+gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
+gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
+google.golang.org/api v0.275.0 h1:vfY5d9vFVJeWEZT65QDd9hbndr7FyZ2+6mIzGAh71NI=
+google.golang.org/api v0.275.0/go.mod h1:Fnag/EWUPIcJXuIkP1pjoTgS5vdxlk3eeemL7Do6bvw=
+google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0=
+google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7/go.mod h1:L43LFes82YgSonw6iTXTxXUX1OlULt4AQtkik4ULL/I=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
 modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
 modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw=
diff --git a/pkg/speech/stt_google.go b/pkg/speech/stt_google.go
index be0d14a7..ebb7145b 100644
--- a/pkg/speech/stt_google.go
+++ b/pkg/speech/stt_google.go
@@ -1,15 +1,19 @@
 package speech
 
 import (
-	"bytes"
 	"context"
-	"encoding/base64"
-	"encoding/json"
+	"errors"
 	"fmt"
 	"io"
-	"net/http"
 	"strings"
 	"time"
+
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/googleapi"
+	"google.golang.org/api/option"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/durationpb"
 )
 
 type GoogleModel string
@@ -71,7 +75,7 @@ type GoogleProvider struct {
 	useEnhanced     bool
 	timeout         time.Duration
 	retries         int
-	client          *http.Client
+	client          googleRecognizeAPI
 }
 
 type GoogleOption func(*GoogleProvider)
@@ -118,11 +122,13 @@ func WithGoogleCredentialsJSON(credentialsJSON string) GoogleOption {
 	}
 }
 
-func NewGoogleProvider(apiKey string, opts ...GoogleOption) (*GoogleProvider, error) {
-	if apiKey == "" {
-		return nil, NewSTTError(ErrAuthentication, "google: API key is required")
+func withGoogleRecognizeClient(client googleRecognizeAPI) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.client = client
 	}
+}
 
+func NewGoogleProvider(apiKey string, opts ...GoogleOption) (*GoogleProvider, error) {
 	p := &GoogleProvider{
 		apiKey:       apiKey,
 		baseURL:      "https://speech.googleapis.com",
@@ -130,18 +136,40 @@ func NewGoogleProvider(apiKey string, opts ...GoogleOption) (*GoogleProvider, er
 		model:        GoogleModelDefault,
 		timeout:      120 * time.Second,
 		retries:      2,
-		client:       &http.Client{Timeout: 120 * time.Second},
 	}
 
 	for _, opt := range opts {
 		opt(p)
 	}
 
-	p.client.Timeout = p.timeout
+	if p.apiKey == "" && p.credentialsJSON == "" {
+		return nil, NewSTTError(ErrAuthentication, "google: API key or credentials JSON is required")
+	}
+
+	if p.client == nil {
+		client, err := newGoogleRecognizeClient(context.Background(), p.clientOptions()...)
+		if err != nil {
+			return nil, NewSTTErrorf(ErrAuthentication, "google-speech: failed to initialize official client: %v", err)
+		}
+		p.client = client
+	}
 
 	return p, nil
 }
 
+func (p *GoogleProvider) clientOptions() []option.ClientOption {
+	opts := make([]option.ClientOption, 0, 2)
+	if p.credentialsJSON != "" {
+		opts = append(opts, option.WithCredentialsJSON([]byte(p.credentialsJSON)))
+	} else {
+		opts = append(opts, option.WithAPIKey(p.apiKey))
+	}
+	if p.baseURL != "" && p.baseURL != "https://speech.googleapis.com" {
+		opts = append(opts, option.WithEndpoint(p.baseURL))
+	}
+	return opts
+}
+
 func (p *GoogleProvider) Name() string {
 	return "google-speech"
 }
@@ -219,6 +247,24 @@ func (p *GoogleProvider) TranscribeStream(ctx context.Context, reader io.Reader,
 }
 
 func (p *GoogleProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	req := p.buildRecognizeRequest(audio, options)
+
+	requestCtx := ctx
+	var cancel context.CancelFunc
+	if _, hasDeadline := ctx.Deadline(); !hasDeadline && p.timeout > 0 {
+		requestCtx, cancel = context.WithTimeout(ctx, p.timeout)
+		defer cancel()
+	}
+
+	resp, err := p.client.Recognize(requestCtx, req)
+	if err != nil {
+		return nil, p.handleClientError(err)
+	}
+
+	return p.parseRecognizeResponse(resp, options)
+}
+
+func (p *GoogleProvider) buildRecognizeRequest(audio []byte, options TranscribeOptions) *speechpb.RecognizeRequest {
 	encoding := p.mapInputFormatToEncoding(options.InputFormat)
 
 	sampleRate := int32(options.SampleRate)
@@ -226,9 +272,9 @@ func (p *GoogleProvider) doTranscribe(ctx context.Context, audio []byte, options
 		sampleRate = p.guessSampleRate(options.InputFormat)
 	}
 
-	reqBody := googleRecognizeRequest{
-		Config: googleRecognitionConfigRequest{
-			Encoding:                   string(encoding),
+	return &speechpb.RecognizeRequest{
+		Config: &speechpb.RecognitionConfig{
+			Encoding:                   p.toProtoRecognitionEncoding(encoding),
 			SampleRateHertz:            sampleRate,
 			LanguageCode:               options.Language,
 			Model:                      string(p.model),
@@ -238,43 +284,10 @@ func (p *GoogleProvider) doTranscribe(ctx context.Context, audio []byte, options
 			EnableWordConfidence:       true,
 			EnableAutomaticPunctuation: true,
 		},
-		Audio: googleAudioRequest{
-			Content: base64.StdEncoding.EncodeToString(audio),
+		Audio: &speechpb.RecognitionAudio{
+			AudioSource: &speechpb.RecognitionAudio_Content{Content: audio},
 		},
 	}
-
-	body, err := json.Marshal(reqBody)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to marshal request: %v", err)
-	}
-
-	url := fmt.Sprintf("%s/v1/speech:recognize?key=%s", p.baseURL, p.apiKey)
-
-	req, err := http.NewRequestWithContext(ctx, "POST", url, bytes.NewReader(body))
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to create request: %v", err)
-	}
-
-	req.Header.Set("Content-Type", "application/json")
-	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
-
-	resp, err := p.client.Do(req)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request failed: %v", err)
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		respBody, _ := io.ReadAll(resp.Body)
-		return nil, p.handleErrorResponse(resp.StatusCode, respBody)
-	}
-
-	respBody, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to read response: %v", err)
-	}
-
-	return p.parseResponse(respBody, options)
 }
 
 func (p *GoogleProvider) mapInputFormatToEncoding(format AudioInputFormat) RecognitionEncoding {
@@ -285,7 +298,9 @@ func (p *GoogleProvider) mapInputFormatToEncoding(format AudioInputFormat) Recog
 		return EncodingFLAC
 	case InputMP3:
 		return EncodingMP3
-	case InputOGG, InputWEBM:
+	case InputOGG:
+		return EncodingOGGOpus
+	case InputWEBM:
 		return EncodingWEBMOpus
 	case InputM4A, InputMP4:
 		return EncodingWEBMOpus
@@ -296,6 +311,31 @@ func (p *GoogleProvider) mapInputFormatToEncoding(format AudioInputFormat) Recog
 	}
 }
 
+func (p *GoogleProvider) toProtoRecognitionEncoding(encoding RecognitionEncoding) speechpb.RecognitionConfig_AudioEncoding {
+	switch encoding {
+	case EncodingLinear16:
+		return speechpb.RecognitionConfig_LINEAR16
+	case EncodingFLAC:
+		return speechpb.RecognitionConfig_FLAC
+	case EncodingMULAW:
+		return speechpb.RecognitionConfig_MULAW
+	case EncodingAMR:
+		return speechpb.RecognitionConfig_AMR
+	case EncodingAMRWB:
+		return speechpb.RecognitionConfig_AMR_WB
+	case EncodingOGGOpus:
+		return speechpb.RecognitionConfig_OGG_OPUS
+	case EncodingSpeexWithHeaderByte:
+		return speechpb.RecognitionConfig_SPEEX_WITH_HEADER_BYTE
+	case EncodingWEBMOpus:
+		return speechpb.RecognitionConfig_WEBM_OPUS
+	case EncodingMP3:
+		return speechpb.RecognitionConfig_MP3
+	default:
+		return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
+	}
+}
+
 func (p *GoogleProvider) guessSampleRate(format AudioInputFormat) int32 {
 	switch format {
 	case InputWAV, InputPCM:
@@ -313,104 +353,41 @@ func (p *GoogleProvider) guessSampleRate(format AudioInputFormat) int32 {
 	}
 }
 
-func (p *GoogleProvider) handleErrorResponse(statusCode int, body []byte) error {
-	var errResp googleErrorResponse
-	if err := json.Unmarshal(body, &errResp); err == nil && errResp.Error.Message != "" {
-		msg := fmt.Sprintf("google-speech: API error: %s (status: %s)", errResp.Error.Message, errResp.Error.Status)
-		switch statusCode {
-		case http.StatusUnauthorized, http.StatusForbidden:
+func (p *GoogleProvider) handleClientError(err error) error {
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request context error: %v", err)
+	}
+
+	var apiErr *googleapi.Error
+	if errors.As(err, &apiErr) {
+		msg := fmt.Sprintf("google-speech: API error: %s", apiErr.Message)
+		switch apiErr.Code {
+		case 400:
+			return NewSTTError(ErrAudioFormatInvalid, msg)
+		case 401, 403:
 			return NewSTTError(ErrAuthentication, msg)
-		case http.StatusTooManyRequests:
+		case 429:
 			return NewSTTError(ErrRateLimited, msg)
-		case http.StatusBadRequest:
-			return NewSTTError(ErrAudioFormatInvalid, msg)
 		default:
 			return NewSTTError(ErrTranscriptionFailed, msg)
 		}
 	}
 
-	switch statusCode {
-	case http.StatusUnauthorized, http.StatusForbidden:
-		return NewSTTError(ErrAuthentication, fmt.Sprintf("google-speech: authentication failed: %s", string(body)))
-	case http.StatusTooManyRequests:
-		return NewSTTError(ErrRateLimited, fmt.Sprintf("google-speech: rate limited: %s", string(body)))
-	case http.StatusBadRequest:
-		return NewSTTError(ErrAudioFormatInvalid, fmt.Sprintf("google-speech: invalid request: %s", string(body)))
-	case http.StatusServiceUnavailable:
-		return NewSTTError(ErrTranscriptionFailed, fmt.Sprintf("google-speech: service unavailable: %s", string(body)))
+	switch status.Code(err) {
+	case codes.InvalidArgument:
+		return NewSTTError(ErrAudioFormatInvalid, "google-speech: invalid recognition request")
+	case codes.Unauthenticated, codes.PermissionDenied:
+		return NewSTTError(ErrAuthentication, "google-speech: authentication failed")
+	case codes.ResourceExhausted:
+		return NewSTTError(ErrRateLimited, "google-speech: rate limited")
 	default:
-		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: unexpected status %d: %s", statusCode, string(body))
+		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request failed: %v", err)
 	}
 }
 
-type googleRecognizeRequest struct {
-	Config googleRecognitionConfigRequest `json:"config"`
-	Audio  googleAudioRequest             `json:"audio"`
-}
-
-type googleRecognitionConfigRequest struct {
-	Encoding                   string `json:"encoding"`
-	SampleRateHertz            int32  `json:"sampleRateHertz"`
-	LanguageCode               string `json:"languageCode"`
-	Model                      string `json:"model,omitempty"`
-	UseEnhanced                bool   `json:"useEnhanced,omitempty"`
-	MaxAlternatives            int32  `json:"maxAlternatives,omitempty"`
-	EnableWordTimeOffsets      bool   `json:"enableWordTimeOffsets,omitempty"`
-	EnableWordConfidence       bool   `json:"enableWordConfidence,omitempty"`
-	EnableAutomaticPunctuation bool   `json:"enableAutomaticPunctuation,omitempty"`
-	EnableSpokenPunctuation    bool   `json:"enableSpokenPunctuation,omitempty"`
-}
-
-type googleAudioRequest struct {
-	Content string `json:"content"`
-}
-
-type googleResponse struct {
-	Results []googleResult `json:"results"`
-}
-
-type googleResult struct {
-	Alternatives  []googleAlternative `json:"alternatives"`
-	LanguageCode  string              `json:"languageCode"`
-	ResultEndTime struct {
-		Seconds string `json:"seconds"`
-		Nanos   int    `json:"nanos"`
-	} `json:"resultEndTime"`
-}
-
-type googleAlternative struct {
-	Transcript string           `json:"transcript"`
-	Confidence float64          `json:"confidence"`
-	Words      []googleWordInfo `json:"words"`
-}
-
-type googleWordInfo struct {
-	StartTime  googleDuration `json:"startTime"`
-	EndTime    googleDuration `json:"endTime"`
-	Word       string         `json:"word"`
-	Confidence float64        `json:"confidence"`
-}
-
-type googleDuration struct {
-	Seconds string `json:"seconds"`
-	Nanos   int    `json:"nanos"`
-}
-
-type googleErrorResponse struct {
-	Error struct {
-		Code    int    `json:"code"`
-		Message string `json:"message"`
-		Status  string `json:"status"`
-	} `json:"error"`
-}
-
-func (p *GoogleProvider) parseResponse(body []byte, options TranscribeOptions) (*TranscriptResult, error) {
-	var resp googleResponse
-	if err := json.Unmarshal(body, &resp); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to parse JSON response: %v", err)
-	}
-
-	if len(resp.Results) == 0 {
+func (p *GoogleProvider) parseRecognizeResponse(resp *speechpb.RecognizeResponse, options TranscribeOptions) (*TranscriptResult, error) {
+	results := resp.GetResults()
+	if len(results) == 0 {
 		return &TranscriptResult{
 			Text:     "",
 			Language: options.Language,
@@ -420,35 +397,40 @@ func (p *GoogleProvider) parseResponse(body []byte, options TranscribeOptions) (
 	result := &TranscriptResult{}
 	var totalConfidence float64
 	var confidenceCount int
+	var lastEnd time.Duration
 
-	for i, res := range resp.Results {
-		if len(res.Alternatives) == 0 {
+	for i, res := range results {
+		if len(res.GetAlternatives()) == 0 {
 			continue
 		}
 
-		primary := res.Alternatives[0]
-
+		primary := res.GetAlternatives()[0]
 		segment := SegmentInfo{
 			ID:   i,
-			Text: primary.Transcript,
+			Text: primary.GetTranscript(),
 		}
 
-		if primary.Confidence > 0 {
-			segment.Confidence = primary.Confidence
-			totalConfidence += primary.Confidence
+		if confidence := primary.GetConfidence(); confidence > 0 {
+			segment.Confidence = float64(confidence)
+			totalConfidence += segment.Confidence
 			confidenceCount++
 		}
 
-		if len(primary.Words) > 0 {
-			segment.Words = make([]WordInfo, 0, len(primary.Words))
-			for _, w := range primary.Words {
-				segment.Words = append(segment.Words, WordInfo{
-					Word:       w.Word,
-					StartTime:  parseGoogleDuration(w.StartTime),
-					EndTime:    parseGoogleDuration(w.EndTime),
-					Confidence: w.Confidence,
-				})
+		if len(primary.GetWords()) > 0 {
+			segment.Words = make([]WordInfo, 0, len(primary.GetWords()))
+			for _, word := range primary.GetWords() {
+				wordInfo := WordInfo{
+					Word:       word.GetWord(),
+					StartTime:  parseProtoDuration(word.GetStartTime()),
+					EndTime:    parseProtoDuration(word.GetEndTime()),
+					Confidence: float64(word.GetConfidence()),
+				}
+				segment.Words = append(segment.Words, wordInfo)
 			}
+			segment.StartTime = segment.Words[0].StartTime
+			segment.EndTime = segment.Words[len(segment.Words)-1].EndTime
+		} else {
+			segment.EndTime = parseProtoDuration(res.GetResultEndTime())
 		}
 
 		if options.WordTimestamps && len(segment.Words) > 0 {
@@ -458,40 +440,47 @@ func (p *GoogleProvider) parseResponse(body []byte, options TranscribeOptions) (
 		result.Segments = append(result.Segments, segment)
 
 		if i == 0 {
-			result.Text = primary.Transcript
-			result.Language = res.LanguageCode
+			result.Text = primary.GetTranscript()
+			if lang := res.GetLanguageCode(); lang != "" {
+				result.Language = lang
+			}
 		} else {
-			result.Text += " " + primary.Transcript
+			result.Text += " " + primary.GetTranscript()
 		}
 
-		if options.MaxAlternatives > 1 && len(res.Alternatives) > 1 {
-			for _, alt := range res.Alternatives[1:] {
-				result.Alternatives = append(result.Alternatives, alt.Transcript)
+		if options.MaxAlternatives > 1 && len(res.GetAlternatives()) > 1 {
+			for _, alt := range res.GetAlternatives()[1:] {
+				result.Alternatives = append(result.Alternatives, alt.GetTranscript())
 			}
 		}
+
+		if segment.EndTime > lastEnd {
+			lastEnd = segment.EndTime
+		}
+	}
+
+	if result.Language == "" {
+		result.Language = options.Language
 	}
 
 	if confidenceCount > 0 {
 		result.Confidence = totalConfidence / float64(confidenceCount)
 	}
 
-	if len(resp.Results) > 0 {
-		endTime := resp.Results[len(resp.Results)-1].ResultEndTime
-		result.Duration = parseGoogleDuration(googleDuration{
-			Seconds: endTime.Seconds,
-			Nanos:   endTime.Nanos,
-		})
+	if lastEnd > 0 {
+		result.Duration = lastEnd
+	} else {
+		result.Duration = parseProtoDuration(resp.GetTotalBilledTime())
 	}
 
 	return result, nil
 }
 
-func parseGoogleDuration(d googleDuration) time.Duration {
-	var seconds int64
-	if d.Seconds != "" {
-		fmt.Sscanf(d.Seconds, "%d", &seconds)
+func parseProtoDuration(d *durationpb.Duration) time.Duration {
+	if d == nil {
+		return 0
 	}
-	return time.Duration(seconds)*time.Second + time.Duration(d.Nanos)*time.Nanosecond
+	return d.AsDuration()
 }
 
 func (p *GoogleProvider) ListLanguages(ctx context.Context) ([]string, error) {
diff --git a/pkg/speech/stt_google_client.go b/pkg/speech/stt_google_client.go
new file mode 100644
index 00000000..aed01cb6
--- /dev/null
+++ b/pkg/speech/stt_google_client.go
@@ -0,0 +1,34 @@
+package speech
+
+import (
+	"context"
+
+	speechapi "cloud.google.com/go/speech/apiv1"
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/option"
+)
+
+type googleRecognizeAPI interface {
+	Recognize(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
+	Close() error
+}
+
+type googleRecognizeClient struct {
+	client *speechapi.Client
+}
+
+func newGoogleRecognizeClient(ctx context.Context, clientOpts ...option.ClientOption) (googleRecognizeAPI, error) {
+	client, err := speechapi.NewRESTClient(ctx, clientOpts...)
+	if err != nil {
+		return nil, err
+	}
+	return &googleRecognizeClient{client: client}, nil
+}
+
+func (c *googleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+	return c.client.Recognize(ctx, req)
+}
+
+func (c *googleRecognizeClient) Close() error {
+	return c.client.Close()
+}
diff --git a/pkg/speech/stt_google_test.go b/pkg/speech/stt_google_test.go
index b9fea4a2..cb7fbdac 100644
--- a/pkg/speech/stt_google_test.go
+++ b/pkg/speech/stt_google_test.go
@@ -2,19 +2,41 @@ package speech
 
 import (
 	"context"
-	"encoding/json"
-	"net/http"
-	"net/http/httptest"
+	"errors"
+	"math"
 	"strings"
 	"testing"
 	"time"
+
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/googleapi"
+	"google.golang.org/protobuf/types/known/durationpb"
 )
 
+type fakeGoogleRecognizeClient struct {
+	calls       int
+	lastRequest *speechpb.RecognizeRequest
+	recognizeFn func(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
+}
+
+func (f *fakeGoogleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+	f.calls++
+	f.lastRequest = req
+	if f.recognizeFn != nil {
+		return f.recognizeFn(ctx, req)
+	}
+	return &speechpb.RecognizeResponse{}, nil
+}
+
+func (f *fakeGoogleRecognizeClient) Close() error {
+	return nil
+}
+
 func TestNewGoogleProvider(t *testing.T) {
-	t.Run("requires API key", func(t *testing.T) {
-		_, err := NewGoogleProvider("")
+	t.Run("requires API key or credentials JSON", func(t *testing.T) {
+		_, err := NewGoogleProvider("", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 		if err == nil {
-			t.Fatal("expected error when API key is empty")
+			t.Fatal("expected error when auth config is empty")
 		}
 		sttErr, ok := err.(*STTError)
 		if !ok {
@@ -26,7 +48,8 @@ func TestNewGoogleProvider(t *testing.T) {
 	})
 
 	t.Run("creates provider with defaults", func(t *testing.T) {
-		p, err := NewGoogleProvider("test-key")
+		fake := &fakeGoogleRecognizeClient{}
+		p, err := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
@@ -48,16 +71,21 @@ func TestNewGoogleProvider(t *testing.T) {
 		if p.retries != 2 {
 			t.Errorf("expected 2 retries, got %d", p.retries)
 		}
+		if p.client != fake {
+			t.Fatal("expected injected fake client to be used")
+		}
 	})
 
 	t.Run("applies options", func(t *testing.T) {
 		p, err := NewGoogleProvider("test-key",
-			WithGoogleBaseURL("https://custom.speech.api.com"),
+			withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}),
+			WithGoogleBaseURL("https://custom.speech.api.com/"),
 			WithGoogleLanguageCode("zh-CN"),
 			WithGoogleModel(GoogleModelLatestLong),
 			WithGoogleEnhanced(true),
 			WithGoogleTimeout(30*time.Second),
 			WithGoogleRetries(5),
+			WithGoogleCredentialsJSON(`{"type":"service_account"}`),
 		)
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
@@ -80,25 +108,15 @@ func TestNewGoogleProvider(t *testing.T) {
 		if p.retries != 5 {
 			t.Errorf("expected 5 retries, got %d", p.retries)
 		}
-	})
-
-	t.Run("trims trailing slash from baseURL", func(t *testing.T) {
-		p, err := NewGoogleProvider("test-key", WithGoogleBaseURL("https://api.example.com/"))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if strings.HasSuffix(p.baseURL, "/") {
-			t.Errorf("baseURL should not have trailing slash: %s", p.baseURL)
+		if p.credentialsJSON == "" {
+			t.Error("expected credentials JSON to be stored")
 		}
 	})
 }
 
 func TestGoogleProviderTranscribe(t *testing.T) {
 	t.Run("rejects empty audio", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 		_, err := p.Transcribe(context.Background(), nil)
 		if err == nil {
 			t.Fatal("expected error for empty audio")
@@ -106,10 +124,7 @@ func TestGoogleProviderTranscribe(t *testing.T) {
 	})
 
 	t.Run("rejects audio too large", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 		largeAudio := make([]byte, 101*1024*1024)
 		_, err := p.Transcribe(context.Background(), largeAudio)
 		if err == nil {
@@ -124,49 +139,46 @@ func TestGoogleProviderTranscribe(t *testing.T) {
 		}
 	})
 
-	t.Run("successful transcription", func(t *testing.T) {
-		response := googleResponse{
-			Results: []googleResult{
-				{
-					Alternatives: []googleAlternative{
+	t.Run("successful transcription and request mapping", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
 						{
-							Transcript: "Hello world",
-							Confidence: 0.95,
-							Words: []googleWordInfo{
-								{Word: "Hello", Confidence: 0.96, StartTime: googleDuration{Seconds: "0", Nanos: 0}, EndTime: googleDuration{Seconds: "0", Nanos: 500000000}},
-								{Word: "world", Confidence: 0.94, StartTime: googleDuration{Seconds: "0", Nanos: 600000000}, EndTime: googleDuration{Seconds: "1", Nanos: 0}},
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{
+									Transcript: "Hello world",
+									Confidence: 0.95,
+									Words: []*speechpb.WordInfo{
+										{
+											Word:       "Hello",
+											Confidence: 0.96,
+											StartTime:  durationpb.New(0),
+											EndTime:    durationpb.New(500 * time.Millisecond),
+										},
+										{
+											Word:       "world",
+											Confidence: 0.94,
+											StartTime:  durationpb.New(600 * time.Millisecond),
+											EndTime:    durationpb.New(time.Second),
+										},
+									},
+								},
 							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(2500 * time.Millisecond),
 						},
 					},
-					LanguageCode: "en-US",
-					ResultEndTime: struct {
-						Seconds string `json:"seconds"`
-						Nanos   int    `json:"nanos"`
-					}{Seconds: "2", Nanos: 500000000},
-				},
+				}, nil
 			},
 		}
 
-		respBody, _ := json.Marshal(response)
-
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			if r.Method != "POST" {
-				t.Errorf("expected POST, got %s", r.Method)
-			}
-			if !strings.Contains(r.URL.Query().Get("key"), "test-key") {
-				t.Error("missing API key in query")
-			}
-			if r.Header.Get("Content-Type") != "application/json" {
-				t.Errorf("expected application/json, got %s", r.Header.Get("Content-Type"))
-			}
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusOK)
-			w.Write(respBody)
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio-data"))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio-data"),
+			WithSTTLanguage("zh-CN"),
+			WithSTTWordTimestamps(true),
+			WithSTTMaxAlternatives(3),
+		)
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
@@ -177,8 +189,8 @@ func TestGoogleProviderTranscribe(t *testing.T) {
 		if result.Language != "en-US" {
 			t.Errorf("expected language 'en-US', got '%s'", result.Language)
 		}
-		if result.Duration != 2500*time.Millisecond {
-			t.Errorf("expected duration 2.5s, got %v", result.Duration)
+		if result.Duration != time.Second {
+			t.Errorf("expected duration 1s from word timestamps, got %v", result.Duration)
 		}
 		if len(result.Segments) != 1 {
 			t.Fatalf("expected 1 segment, got %d", len(result.Segments))
@@ -186,243 +198,151 @@ func TestGoogleProviderTranscribe(t *testing.T) {
 		if len(result.Segments[0].Words) != 2 {
 			t.Fatalf("expected 2 words, got %d", len(result.Segments[0].Words))
 		}
-		if result.Confidence != 0.95 {
+		if math.Abs(result.Confidence-0.95) > 0.0001 {
 			t.Errorf("expected confidence 0.95, got %f", result.Confidence)
 		}
-	})
 
-	t.Run("multiple segments", func(t *testing.T) {
-		response := googleResponse{
-			Results: []googleResult{
-				{Alternatives: []googleAlternative{{Transcript: "First segment"}}, LanguageCode: "en-US"},
-				{Alternatives: []googleAlternative{{Transcript: "Second segment"}}, LanguageCode: "en-US"},
-			},
+		req := fake.lastRequest
+		if req == nil {
+			t.Fatal("expected request to be captured")
 		}
-
-		respBody, _ := json.Marshal(response)
-
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.Write(respBody)
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
+		if req.GetConfig().GetLanguageCode() != "zh-CN" {
+			t.Errorf("expected request language zh-CN, got %s", req.GetConfig().GetLanguageCode())
 		}
-
-		expectedText := "First segment Second segment"
-		if result.Text != expectedText {
-			t.Errorf("expected '%s', got '%s'", expectedText, result.Text)
+		if !req.GetConfig().GetEnableWordTimeOffsets() {
+			t.Error("expected EnableWordTimeOffsets to be true")
 		}
-		if len(result.Segments) != 2 {
-			t.Fatalf("expected 2 segments, got %d", len(result.Segments))
+		if req.GetConfig().GetMaxAlternatives() != 3 {
+			t.Errorf("expected max alternatives 3, got %d", req.GetConfig().GetMaxAlternatives())
+		}
+		if req.GetConfig().GetEncoding() != speechpb.RecognitionConfig_MP3 {
+			t.Errorf("expected MP3 encoding, got %v", req.GetConfig().GetEncoding())
+		}
+		if len(req.GetAudio().GetContent()) == 0 {
+			t.Error("expected inline audio content to be populated")
 		}
 	})
 
-	t.Run("alternatives", func(t *testing.T) {
-		response := googleResponse{
-			Results: []googleResult{
-				{
-					Alternatives: []googleAlternative{
-						{Transcript: "Hello world", Confidence: 0.95},
-						{Transcript: "Hello word", Confidence: 0.80},
-						{Transcript: "Halo world", Confidence: 0.70},
+	t.Run("multiple segments and alternatives", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{Transcript: "First segment", Confidence: 0.9},
+								{Transcript: "First segments", Confidence: 0.7},
+							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(time.Second),
+						},
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{Transcript: "Second segment", Confidence: 0.8},
+								{Transcript: "Second segments", Confidence: 0.6},
+							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(2 * time.Second),
+						},
 					},
-					LanguageCode: "en-US",
-				},
+				}, nil
 			},
 		}
 
-		respBody, _ := json.Marshal(response)
-
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.Write(respBody)
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"),
-			WithSTTMaxAlternatives(3))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio"), WithSTTMaxAlternatives(3))
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
 
+		if result.Text != "First segment Second segment" {
+			t.Errorf("unexpected combined text: %s", result.Text)
+		}
+		if len(result.Segments) != 2 {
+			t.Fatalf("expected 2 segments, got %d", len(result.Segments))
+		}
 		if len(result.Alternatives) != 2 {
 			t.Fatalf("expected 2 alternatives, got %d", len(result.Alternatives))
 		}
-		if result.Alternatives[0] != "Hello word" {
-			t.Errorf("expected first alternative 'Hello word', got '%s'", result.Alternatives[0])
+		if result.Duration != 2*time.Second {
+			t.Errorf("expected duration 2s, got %v", result.Duration)
 		}
 	})
 
 	t.Run("empty results", func(t *testing.T) {
-		response := googleResponse{Results: []googleResult{}}
-		respBody, _ := json.Marshal(response)
-
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.Write(respBody)
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
 		if result.Text != "" {
-			t.Errorf("expected empty text, got '%s'", result.Text)
+			t.Errorf("expected empty text, got %q", result.Text)
 		}
 	})
 
-	t.Run("handles authentication error", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusUnauthorized)
-			w.Write([]byte(`{"error":{"code":401,"message":"API key not valid. Please pass a valid API key.","status":"UNAUTHENTICATED"}}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("bad-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(0))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected authentication error")
-		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrAuthentication {
-			t.Errorf("expected ErrAuthentication, got %s", sttErr.Code)
+	t.Run("does not retry auth errors", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return nil, &googleapi.Error{Code: 401, Message: "invalid API key"}
+			},
 		}
-	})
-
-	t.Run("handles forbidden error", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusForbidden)
-			w.Write([]byte(`{"error":{"code":403,"message":"API key expired.","status":"PERMISSION_DENIED"}}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("expired-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(0))
+		p, _ := NewGoogleProvider("bad-key", withGoogleRecognizeClient(fake), WithGoogleRetries(3))
 		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
 		if err == nil {
 			t.Fatal("expected authentication error")
 		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrAuthentication {
-			t.Errorf("expected ErrAuthentication, got %s", sttErr.Code)
-		}
-	})
-
-	t.Run("handles rate limit error", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusTooManyRequests)
-			w.Write([]byte(`{"error":{"code":429,"message":"Quota exceeded.","status":"RESOURCE_EXHAUSTED"}}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(0))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected rate limit error")
-		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrRateLimited {
-			t.Errorf("expected ErrRateLimited, got %s", sttErr.Code)
+		if fake.calls != 1 {
+			t.Errorf("expected 1 call, got %d", fake.calls)
 		}
 	})
 
-	t.Run("context cancellation", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.WriteHeader(http.StatusServiceUnavailable)
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(1))
-
-		ctx, cancel := context.WithCancel(context.Background())
-		cancel()
-
-		_, err := p.Transcribe(ctx, []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected context cancellation error")
+	t.Run("retries transient errors then succeeds", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{}
+		fake.recognizeFn = func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+			if fake.calls == 1 {
+				return nil, &googleapi.Error{Code: 503, Message: "service unavailable"}
+			}
+			return &speechpb.RecognizeResponse{
+				Results: []*speechpb.SpeechRecognitionResult{
+					{
+						Alternatives:  []*speechpb.SpeechRecognitionAlternative{{Transcript: "Success after retry"}},
+						LanguageCode:  "en-US",
+						ResultEndTime: durationpb.New(time.Second),
+					},
+				},
+			}, nil
 		}
-	})
 
-	t.Run("uses correct URL with API key", func(t *testing.T) {
-		var receivedURL string
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			receivedURL = r.URL.String()
-			w.Header().Set("Content-Type", "application/json")
-			w.Write([]byte(`{"results":[{"alternatives":[{"transcript":"test"}],"languageCode":"en-US"}]}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("my-api-key", WithGoogleBaseURL(server.URL))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(2))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
 		if err != nil {
 			t.Fatalf("unexpected error: %v", err)
 		}
-		if !strings.Contains(receivedURL, "key=my-api-key") {
-			t.Errorf("expected URL to contain 'key=my-api-key', got %s", receivedURL)
+		if result.Text != "Success after retry" {
+			t.Errorf("expected 'Success after retry', got '%s'", result.Text)
 		}
-		if !strings.Contains(receivedURL, "/v1/speech:recognize") {
-			t.Errorf("expected URL to contain '/v1/speech:recognize', got %s", receivedURL)
+		if fake.calls != 2 {
+			t.Errorf("expected 2 calls, got %d", fake.calls)
 		}
 	})
 
-	t.Run("sends correct request body", func(t *testing.T) {
-		var receivedBody googleRecognizeRequest
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			json.NewDecoder(r.Body).Decode(&receivedBody)
-			w.Header().Set("Content-Type", "application/json")
-			w.Write([]byte(`{"results":[{"alternatives":[{"transcript":"test"}],"languageCode":"en-US"}]}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"),
-			WithSTTLanguage("zh-CN"),
-			WithSTTWordTimestamps(true),
-			WithSTTMaxAlternatives(3))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-
-		if receivedBody.Config.LanguageCode != "zh-CN" {
-			t.Errorf("expected language zh-CN, got %s", receivedBody.Config.LanguageCode)
-		}
-		if !receivedBody.Config.EnableWordTimeOffsets {
-			t.Error("expected EnableWordTimeOffsets to be true")
-		}
-		if receivedBody.Config.MaxAlternatives != 3 {
-			t.Errorf("expected maxAlternatives 3, got %d", receivedBody.Config.MaxAlternatives)
+	t.Run("context cancellation", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return nil, context.Canceled
+			},
 		}
-		if receivedBody.Config.EnableAutomaticPunctuation != true {
-			t.Error("expected EnableAutomaticPunctuation to be true")
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(0))
+		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		if err == nil {
+			t.Fatal("expected context cancellation error")
 		}
 	})
 }
 
 func TestGoogleProviderTranscribeStream(t *testing.T) {
 	t.Run("rejects nil reader", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 		_, err := p.TranscribeStream(context.Background(), nil)
 		if err == nil {
 			t.Fatal("expected error for nil reader")
@@ -430,13 +350,19 @@ func TestGoogleProviderTranscribeStream(t *testing.T) {
 	})
 
 	t.Run("successful stream transcription", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			w.Header().Set("Content-Type", "application/json")
-			w.Write([]byte(`{"results":[{"alternatives":[{"transcript":"Stream content","confidence":0.9}],"languageCode":"en-US"}]}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{{Transcript: "Stream content", Confidence: 0.9}},
+							LanguageCode: "en-US",
+						},
+					},
+				}, nil
+			},
+		}
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
 		reader := strings.NewReader("stream-audio-data")
 		result, err := p.TranscribeStream(context.Background(), reader)
 		if err != nil {
@@ -449,27 +375,22 @@ func TestGoogleProviderTranscribeStream(t *testing.T) {
 }
 
 func TestGoogleProviderTranscribeFile(t *testing.T) {
-	t.Run("returns not supported error", func(t *testing.T) {
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL))
-		_, err := p.TranscribeFile(context.Background(), "/some/file.mp3")
-		if err == nil {
-			t.Fatal("expected error for file transcription")
-		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrProviderNotSupported {
-			t.Errorf("expected ErrProviderNotSupported, got %s", sttErr.Code)
-		}
-	})
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+	_, err := p.TranscribeFile(context.Background(), "/some/file.mp3")
+	if err == nil {
+		t.Fatal("expected error for file transcription")
+	}
+	sttErr, ok := err.(*STTError)
+	if !ok {
+		t.Fatalf("expected *STTError, got %T", err)
+	}
+	if sttErr.Code != ErrProviderNotSupported {
+		t.Errorf("expected ErrProviderNotSupported, got %s", sttErr.Code)
+	}
 }
 
 func TestGoogleProviderListLanguages(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key")
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 	langs, err := p.ListLanguages(context.Background())
 	if err != nil {
 		t.Fatalf("unexpected error: %v", err)
@@ -477,32 +398,10 @@ func TestGoogleProviderListLanguages(t *testing.T) {
 	if len(langs) == 0 {
 		t.Fatal("expected non-empty language list")
 	}
-
-	found := false
-	for _, lang := range langs {
-		if lang == "en-US" {
-			found = true
-			break
-		}
-	}
-	if !found {
-		t.Error("expected 'en-US' in language list")
-	}
-
-	found = false
-	for _, lang := range langs {
-		if lang == "zh-CN" {
-			found = true
-			break
-		}
-	}
-	if !found {
-		t.Error("expected 'zh-CN' in language list")
-	}
 }
 
 func TestGoogleProviderEncodingMapping(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key")
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 
 	tests := []struct {
 		format AudioInputFormat
@@ -512,7 +411,7 @@ func TestGoogleProviderEncodingMapping(t *testing.T) {
 		{InputPCM, EncodingLinear16},
 		{InputFLAC, EncodingFLAC},
 		{InputMP3, EncodingMP3},
-		{InputOGG, EncodingWEBMOpus},
+		{InputOGG, EncodingOGGOpus},
 		{InputWEBM, EncodingWEBMOpus},
 		{InputM4A, EncodingWEBMOpus},
 		{InputMP4, EncodingWEBMOpus},
@@ -531,7 +430,7 @@ func TestGoogleProviderEncodingMapping(t *testing.T) {
 }
 
 func TestGoogleProviderSampleRateGuessing(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key")
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 
 	tests := []struct {
 		format AudioInputFormat
@@ -557,137 +456,89 @@ func TestGoogleProviderSampleRateGuessing(t *testing.T) {
 	}
 }
 
-func TestGoogleProviderRetries(t *testing.T) {
-	t.Run("retries on server error then succeeds", func(t *testing.T) {
-		callCount := 0
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			callCount++
-			if callCount < 2 {
-				w.Header().Set("Content-Type", "application/json")
-				w.WriteHeader(http.StatusServiceUnavailable)
-				w.Write([]byte(`{"error":{"code":503,"message":"Service unavailable","status":"UNAVAILABLE"}}`))
-				return
-			}
-			w.Header().Set("Content-Type", "application/json")
-			w.Write([]byte(`{"results":[{"alternatives":[{"transcript":"Success after retry"}],"languageCode":"en-US"}]}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("test-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(2))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if result.Text != "Success after retry" {
-			t.Errorf("expected 'Success after retry', got '%s'", result.Text)
-		}
-		if callCount != 2 {
-			t.Errorf("expected 2 calls, got %d", callCount)
-		}
-	})
-
-	t.Run("does not retry on auth error", func(t *testing.T) {
-		callCount := 0
-		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
-			callCount++
-			w.Header().Set("Content-Type", "application/json")
-			w.WriteHeader(http.StatusUnauthorized)
-			w.Write([]byte(`{"error":{"code":401,"message":"Invalid API key","status":"UNAUTHENTICATED"}}`))
-		}))
-		defer server.Close()
-
-		p, _ := NewGoogleProvider("bad-key", WithGoogleBaseURL(server.URL), WithGoogleRetries(3))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected error")
-		}
-		if callCount != 1 {
-			t.Errorf("expected 1 call (no retry on auth error), got %d", callCount)
-		}
-	})
-}
-
-func TestParseGoogleDuration(t *testing.T) {
+func TestParseProtoDuration(t *testing.T) {
 	tests := []struct {
 		name string
-		d    googleDuration
+		d    *durationpb.Duration
 		want time.Duration
 	}{
-		{"zero", googleDuration{Seconds: "0", Nanos: 0}, 0},
-		{"one second", googleDuration{Seconds: "1", Nanos: 0}, time.Second},
-		{"500ms", googleDuration{Seconds: "0", Nanos: 500000000}, 500 * time.Millisecond},
-		{"2.5s", googleDuration{Seconds: "2", Nanos: 500000000}, 2500 * time.Millisecond},
-		{"1.234s", googleDuration{Seconds: "1", Nanos: 234000000}, time.Second + 234*time.Millisecond},
+		{"nil", nil, 0},
+		{"zero", durationpb.New(0), 0},
+		{"one second", durationpb.New(time.Second), time.Second},
+		{"500ms", durationpb.New(500 * time.Millisecond), 500 * time.Millisecond},
+		{"2.5s", durationpb.New(2500 * time.Millisecond), 2500 * time.Millisecond},
 	}
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			got := parseGoogleDuration(tt.d)
+			got := parseProtoDuration(tt.d)
 			if got != tt.want {
-				t.Errorf("parseGoogleDuration(%v) = %v, want %v", tt.d, got, tt.want)
+				t.Errorf("parseProtoDuration(%v) = %v, want %v", tt.d, got, tt.want)
 			}
 		})
 	}
 }
 
 func TestNewSTTProviderGoogle(t *testing.T) {
-	t.Run("creates Google provider", func(t *testing.T) {
-		p, err := NewSTTProvider(STTConfig{
-			Type:    STTProviderGoogle,
-			APIKey:  "test-key",
-			Timeout: 30 * time.Second,
-		})
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if p.Type() != STTProviderGoogle {
-			t.Errorf("expected STTProviderGoogle, got %s", p.Type())
-		}
-		if p.Name() != "google-speech" {
-			t.Errorf("expected name 'google-speech', got %s", p.Name())
-		}
-	})
-
-	t.Run("creates Google provider with language", func(t *testing.T) {
-		p, err := NewSTTProvider(STTConfig{
-			Type:     STTProviderGoogle,
-			APIKey:   "test-key",
-			Language: "zh-CN",
-		})
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		gp, ok := p.(*GoogleProvider)
-		if !ok {
-			t.Fatalf("expected *GoogleProvider, got %T", p)
-		}
-		if gp.languageCode != "zh-CN" {
-			t.Errorf("expected language zh-CN, got %s", gp.languageCode)
-		}
+	p, err := NewSTTProvider(STTConfig{
+		Type:    STTProviderGoogle,
+		APIKey:  "test-key",
+		Timeout: 30 * time.Second,
 	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if p.Type() != STTProviderGoogle {
+		t.Errorf("expected STTProviderGoogle, got %s", p.Type())
+	}
+	if p.Name() != "google-speech" {
+		t.Errorf("expected name 'google-speech', got %s", p.Name())
+	}
 }
 
 func TestGoogleSTTManager(t *testing.T) {
-	t.Run("register and use Google provider", func(t *testing.T) {
-		m := NewSTTManager()
-		p, _ := NewGoogleProvider("test-key")
+	m := NewSTTManager()
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
 
-		err := m.Register("google", p)
-		if err != nil {
-			t.Fatalf("failed to register provider: %v", err)
-		}
+	err := m.Register("google", p)
+	if err != nil {
+		t.Fatalf("failed to register provider: %v", err)
+	}
 
-		providers := m.ListProviders()
-		if len(providers) != 1 {
-			t.Fatalf("expected 1 provider, got %d", len(providers))
-		}
+	got, err := m.Get("google")
+	if err != nil {
+		t.Fatalf("failed to get provider: %v", err)
+	}
+	if got.Type() != STTProviderGoogle {
+		t.Errorf("expected STTProviderGoogle, got %s", got.Type())
+	}
+}
 
-		got, err := m.Get("google")
-		if err != nil {
-			t.Fatalf("failed to get provider: %v", err)
-		}
-		if got.Type() != STTProviderGoogle {
-			t.Errorf("expected STTProviderGoogle, got %s", got.Type())
-		}
-	})
+func TestGoogleProviderHandleClientError(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+
+	tests := []struct {
+		name string
+		err  error
+		want STTErrorCode
+	}{
+		{"bad request", &googleapi.Error{Code: 400, Message: "bad request"}, ErrAudioFormatInvalid},
+		{"unauthorized", &googleapi.Error{Code: 401, Message: "unauthorized"}, ErrAuthentication},
+		{"forbidden", &googleapi.Error{Code: 403, Message: "forbidden"}, ErrAuthentication},
+		{"rate limited", &googleapi.Error{Code: 429, Message: "quota"}, ErrRateLimited},
+		{"generic", errors.New("boom"), ErrTranscriptionFailed},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := p.handleClientError(tt.err)
+			sttErr, ok := err.(*STTError)
+			if !ok {
+				t.Fatalf("expected *STTError, got %T", err)
+			}
+			if sttErr.Code != tt.want {
+				t.Errorf("expected %s, got %s", tt.want, sttErr.Code)
+			}
+		})
+	}
 }
diff --git a/pkg/speech/stt_provider.go b/pkg/speech/stt_provider.go
index 5f6c9ba6..10f6bd5a 100644
--- a/pkg/speech/stt_provider.go
+++ b/pkg/speech/stt_provider.go
@@ -43,13 +43,14 @@ type STTProvider interface {
 }
 
 type STTConfig struct {
-	Type       STTProviderType
-	APIKey     string
-	BaseURL    string
-	Model      string
-	Language   string
-	SampleRate int
-	Timeout    time.Duration
+	Type            STTProviderType
+	APIKey          string
+	CredentialsJSON string
+	BaseURL         string
+	Model           string
+	Language        string
+	SampleRate      int
+	Timeout         time.Duration
 }
 
 func NewSTTProvider(cfg STTConfig) (STTProvider, error) {
@@ -71,6 +72,9 @@ func NewSTTProvider(cfg STTConfig) (STTProvider, error) {
 		return NewWhisperProvider(cfg.APIKey, opts...)
 	case STTProviderGoogle:
 		opts := []GoogleOption{}
+		if cfg.CredentialsJSON != "" {
+			opts = append(opts, WithGoogleCredentialsJSON(cfg.CredentialsJSON))
+		}
 		if cfg.BaseURL != "" {
 			opts = append(opts, WithGoogleBaseURL(cfg.BaseURL))
 		}

From cf3b68a4ad2293caae1eef75bcf26181b2e54e2b Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Tue, 28 Apr 2026 17:20:11 +0800
Subject: [PATCH 4/6] chore(export): add speech vad stt grouped export folder

---
 .../pkg/speech/vad.go                         | 319 +++++++++
 .../pkg/speech/vad_provider.go                |  58 ++
 .../pkg/speech/vad_webrtc.go                  | 144 ++++
 .../pkg/speech/voicewake.go                   | 631 ++++++++++++++++++
 .../pkg/speech/stt_openai_client.go           | 126 ++++
 .../pkg/speech/stt_whisper.go                 | 521 +++++++++++++++
 .../03_stt_google_official_client/go.mod      | 102 +++
 .../03_stt_google_official_client/go.sum      | 251 +++++++
 .../pkg/speech/stt_google.go                  | 502 ++++++++++++++
 .../pkg/speech/stt_google_client.go           |  34 +
 .../pkg/speech/stt_google_test.go             | 544 +++++++++++++++
 .../pkg/speech/stt_provider.go                | 249 +++++++
 .../README.md                                 |  46 ++
 13 files changed, 3527 insertions(+)
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
 create mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/README.md

diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
new file mode 100644
index 00000000..045f0f9f
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
@@ -0,0 +1,319 @@
+package speech
+
+import (
+	"math"
+	"sync"
+)
+
+type VADState string
+
+const (
+	VADStateSilence VADState = "silence"
+	VADStateSpeech  VADState = "speech"
+)
+
+type VADConfig struct {
+	SampleRate         int
+	FrameSize          int
+	Aggressiveness     int
+	EnergyThreshold    float64
+	ZeroCrossThreshold int
+	SpeechMinFrames    int
+	SilenceFrames      int
+	HangoverFrames     int
+}
+
+func DefaultVADConfig() VADConfig {
+	return VADConfig{
+		SampleRate:         16000,
+		FrameSize:          320,
+		Aggressiveness:     2,
+		EnergyThreshold:    0.01,
+		ZeroCrossThreshold: 50,
+		SpeechMinFrames:    3,
+		SilenceFrames:      30,
+		HangoverFrames:     10,
+	}
+}
+
+type VAD struct {
+	mu                 sync.Mutex
+	cfg                VADConfig
+	state              VADState
+	consecutiveSpeech  int
+	consecutiveSilence int
+	listeners          []VADStateListener
+}
+
+type VADStateListener func(state VADState, energy float64, zcr float64)
+
+func (v *VAD) Name() string {
+	return "heuristic-vad"
+}
+
+func (v *VAD) Type() VADProviderType {
+	return VADProviderHeuristic
+}
+
+func NewVAD(cfg VADConfig) *VAD {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
+		cfg.Aggressiveness = 2
+	}
+	if cfg.EnergyThreshold == 0 {
+		cfg.EnergyThreshold = 0.01
+	}
+	if cfg.ZeroCrossThreshold == 0 {
+		cfg.ZeroCrossThreshold = 50
+	}
+	if cfg.SpeechMinFrames == 0 {
+		cfg.SpeechMinFrames = 3
+	}
+	if cfg.SilenceFrames == 0 {
+		cfg.SilenceFrames = 30
+	}
+	if cfg.HangoverFrames == 0 {
+		cfg.HangoverFrames = 10
+	}
+
+	return &VAD{
+		cfg:   cfg,
+		state: VADStateSilence,
+	}
+}
+
+func (v *VAD) RegisterListener(listener VADStateListener) {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	v.listeners = append(v.listeners, listener)
+}
+
+func (v *VAD) ProcessFrame(samples []int16) VADState {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+
+	energy := v.calculateRMS(samples)
+	zcr := v.calculateZeroCrossingRate(samples)
+
+	isSpeech := v.isSpeechFrame(energy, zcr)
+
+	if isSpeech {
+		v.consecutiveSpeech++
+		v.consecutiveSilence = 0
+	} else {
+		v.consecutiveSilence++
+		v.consecutiveSpeech = 0
+	}
+
+	switch v.state {
+	case VADStateSilence:
+		if isSpeech {
+			if v.consecutiveSpeech >= v.cfg.SpeechMinFrames {
+				v.state = VADStateSpeech
+				v.notifyListeners(VADStateSpeech, energy, zcr)
+			}
+		} else {
+			v.consecutiveSpeech = 0
+		}
+
+	case VADStateSpeech:
+		if isSpeech {
+			v.consecutiveSilence = 0
+		} else {
+			if v.consecutiveSilence >= v.cfg.HangoverFrames {
+				v.state = VADStateSilence
+				v.consecutiveSpeech = 0
+				v.consecutiveSilence = 0
+				v.notifyListeners(VADStateSilence, energy, zcr)
+			}
+		}
+	}
+
+	return v.state
+}
+
+func (v *VAD) ProcessFloatFrame(samples []float32) VADState {
+	intSamples := make([]int16, len(samples))
+	for i, s := range samples {
+		clamped := s
+		if clamped > 1.0 {
+			clamped = 1.0
+		}
+		if clamped < -1.0 {
+			clamped = -1.0
+		}
+		intSamples[i] = int16(clamped * 32767.0)
+	}
+	return v.ProcessFrame(intSamples)
+}
+
+func (v *VAD) isSpeechFrame(energy float64, zcr float64) bool {
+	return energy > v.cfg.EnergyThreshold || zcr > float64(v.cfg.ZeroCrossThreshold)
+}
+
+func (v *VAD) calculateRMS(samples []int16) float64 {
+	if len(samples) == 0 {
+		return 0
+	}
+
+	var sumSquares float64
+	for _, s := range samples {
+		normalized := float64(s) / 32768.0
+		sumSquares += normalized * normalized
+	}
+
+	return math.Sqrt(sumSquares / float64(len(samples)))
+}
+
+func (v *VAD) calculateZeroCrossingRate(samples []int16) float64 {
+	if len(samples) < 2 {
+		return 0
+	}
+
+	var crossings int
+	for i := 1; i < len(samples); i++ {
+		if (samples[i] >= 0 && samples[i-1] < 0) || (samples[i] < 0 && samples[i-1] >= 0) {
+			crossings++
+		}
+	}
+
+	return float64(crossings)
+}
+
+func (v *VAD) State() VADState {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	return v.state
+}
+
+func (v *VAD) Reset() {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	v.state = VADStateSilence
+	v.consecutiveSpeech = 0
+	v.consecutiveSilence = 0
+}
+
+func (v *VAD) notifyListeners(state VADState, energy float64, zcr float64) {
+	for _, listener := range v.listeners {
+		listener(state, energy, zcr)
+	}
+}
+
+func (v *VAD) UpdateConfig(cfg VADConfig) {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	if cfg.EnergyThreshold > 0 {
+		v.cfg.EnergyThreshold = cfg.EnergyThreshold
+	}
+	if cfg.ZeroCrossThreshold > 0 {
+		v.cfg.ZeroCrossThreshold = cfg.ZeroCrossThreshold
+	}
+	if cfg.SpeechMinFrames > 0 {
+		v.cfg.SpeechMinFrames = cfg.SpeechMinFrames
+	}
+	if cfg.SilenceFrames > 0 {
+		v.cfg.SilenceFrames = cfg.SilenceFrames
+	}
+	if cfg.HangoverFrames > 0 {
+		v.cfg.HangoverFrames = cfg.HangoverFrames
+	}
+}
+
+func (v *VAD) Config() VADConfig {
+	v.mu.Lock()
+	defer v.mu.Unlock()
+	return v.cfg
+}
+
+func NormalizeAudio(samples []int16) []float64 {
+	result := make([]float64, len(samples))
+	for i, s := range samples {
+		result[i] = float64(s) / 32768.0
+	}
+	return result
+}
+
+func Float32ToInt16(samples []float32) []int16 {
+	result := make([]int16, len(samples))
+	for i, s := range samples {
+		clamped := s
+		if clamped > 1.0 {
+			clamped = 1.0
+		}
+		if clamped < -1.0 {
+			clamped = -1.0
+		}
+		result[i] = int16(clamped * 32767.0)
+	}
+	return result
+}
+
+func Int16ToWAV(samples []int16, sampleRate int, channels int) []byte {
+	if len(samples) == 0 {
+		return nil
+	}
+
+	bitsPerSample := 16
+	byteRate := sampleRate * channels * bitsPerSample / 8
+	blockAlign := channels * bitsPerSample / 8
+	dataSize := len(samples) * 2
+	fileSize := 36 + dataSize
+
+	buf := make([]byte, 44+dataSize)
+
+	copy(buf[0:4], []byte("RIFF"))
+	buf[4] = byte(fileSize)
+	buf[5] = byte(fileSize >> 8)
+	buf[6] = byte(fileSize >> 16)
+	buf[7] = byte(fileSize >> 24)
+
+	copy(buf[8:12], []byte("WAVE"))
+
+	copy(buf[12:16], []byte("fmt "))
+	buf[16] = 16
+	buf[17] = 0
+	buf[18] = 0
+	buf[19] = 0
+
+	buf[20] = 1
+	buf[21] = 0
+
+	buf[22] = byte(channels)
+	buf[23] = 0
+
+	buf[24] = byte(sampleRate)
+	buf[25] = byte(sampleRate >> 8)
+	buf[26] = byte(sampleRate >> 16)
+	buf[27] = byte(sampleRate >> 24)
+
+	buf[28] = byte(byteRate)
+	buf[29] = byte(byteRate >> 8)
+	buf[30] = byte(byteRate >> 16)
+	buf[31] = byte(byteRate >> 24)
+
+	buf[32] = byte(blockAlign)
+	buf[33] = 0
+
+	buf[34] = byte(bitsPerSample)
+	buf[35] = 0
+
+	copy(buf[36:40], []byte("data"))
+	buf[40] = byte(dataSize)
+	buf[41] = byte(dataSize >> 8)
+	buf[42] = byte(dataSize >> 16)
+	buf[43] = byte(dataSize >> 24)
+
+	for i, s := range samples {
+		offset := 44 + i*2
+		buf[offset] = byte(s)
+		buf[offset+1] = byte(s >> 8)
+	}
+
+	return buf
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
new file mode 100644
index 00000000..ce0de4e2
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
@@ -0,0 +1,58 @@
+package speech
+
+import "fmt"
+
+type VADProviderType string
+
+const (
+	VADProviderHeuristic VADProviderType = "heuristic"
+	VADProviderWebRTC    VADProviderType = "webrtc"
+)
+
+type VADProcessor interface {
+	Name() string
+	Type() VADProviderType
+	ProcessFrame(samples []int16) VADState
+	ProcessFloatFrame(samples []float32) VADState
+	RegisterListener(listener VADStateListener)
+	State() VADState
+	Reset()
+	UpdateConfig(cfg VADConfig)
+	Config() VADConfig
+}
+
+type VADProviderFactory func(cfg VADConfig) (VADProcessor, error)
+
+type VADManager struct {
+	factories map[VADProviderType]VADProviderFactory
+}
+
+func NewVADManager() *VADManager {
+	m := &VADManager{
+		factories: map[VADProviderType]VADProviderFactory{},
+	}
+	m.Register(VADProviderHeuristic, func(cfg VADConfig) (VADProcessor, error) {
+		return NewVAD(cfg), nil
+	})
+	m.Register(VADProviderWebRTC, func(cfg VADConfig) (VADProcessor, error) {
+		return NewWebRTCVAD(cfg)
+	})
+	return m
+}
+
+func (m *VADManager) Register(providerType VADProviderType, factory VADProviderFactory) {
+	m.factories[providerType] = factory
+}
+
+func (m *VADManager) New(cfg VADConfig, providerType VADProviderType) (VADProcessor, error) {
+	if providerType == "" {
+		providerType = VADProviderHeuristic
+	}
+
+	factory, ok := m.factories[providerType]
+	if !ok {
+		return nil, fmt.Errorf("vad: unsupported provider %q", providerType)
+	}
+
+	return factory(cfg)
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
new file mode 100644
index 00000000..4a7d2163
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
@@ -0,0 +1,144 @@
+package speech
+
+import (
+	"encoding/binary"
+	"fmt"
+
+	webrtcvad "github.com/godeps/webrtcvad-go"
+)
+
+type WebRTCVAD struct {
+	inner      *VAD
+	detector   *webrtcvad.VAD
+	mode       int
+	sampleRate int
+	frameSize  int
+}
+
+func NewWebRTCVAD(cfg VADConfig) (*WebRTCVAD, error) {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
+		cfg.Aggressiveness = 2
+	}
+
+	if !webrtcvad.ValidRateAndFrameLength(cfg.SampleRate, cfg.FrameSize) {
+		return nil, fmt.Errorf("vad: invalid WebRTC sampleRate/frameSize combination: %d/%d", cfg.SampleRate, cfg.FrameSize)
+	}
+
+	detector, err := webrtcvad.New(cfg.Aggressiveness)
+	if err != nil {
+		return nil, fmt.Errorf("vad: failed to create WebRTC VAD: %w", err)
+	}
+
+	return &WebRTCVAD{
+		inner:      NewVAD(cfg),
+		detector:   detector,
+		mode:       cfg.Aggressiveness,
+		sampleRate: cfg.SampleRate,
+		frameSize:  cfg.FrameSize,
+	}, nil
+}
+
+func (v *WebRTCVAD) Name() string {
+	return "webrtc-vad"
+}
+
+func (v *WebRTCVAD) Type() VADProviderType {
+	return VADProviderWebRTC
+}
+
+func (v *WebRTCVAD) ProcessFrame(samples []int16) VADState {
+	if len(samples) == 0 {
+		return v.inner.ProcessFrame(samples)
+	}
+
+	audio := int16ToLittleEndianBytes(samples)
+	isSpeech, err := v.detector.IsSpeech(audio, v.sampleRate)
+	if err != nil {
+		return v.inner.ProcessFrame(samples)
+	}
+
+	v.inner.mu.Lock()
+	defer v.inner.mu.Unlock()
+
+	energy := v.inner.calculateRMS(samples)
+	zcr := v.inner.calculateZeroCrossingRate(samples)
+
+	if isSpeech {
+		v.inner.consecutiveSpeech++
+		v.inner.consecutiveSilence = 0
+	} else {
+		v.inner.consecutiveSilence++
+		v.inner.consecutiveSpeech = 0
+	}
+
+	switch v.inner.state {
+	case VADStateSilence:
+		if isSpeech {
+			if v.inner.consecutiveSpeech >= v.inner.cfg.SpeechMinFrames {
+				v.inner.state = VADStateSpeech
+				v.inner.notifyListeners(VADStateSpeech, energy, zcr)
+			}
+		} else {
+			v.inner.consecutiveSpeech = 0
+		}
+
+	case VADStateSpeech:
+		if isSpeech {
+			v.inner.consecutiveSilence = 0
+		} else {
+			if v.inner.consecutiveSilence >= v.inner.cfg.HangoverFrames {
+				v.inner.state = VADStateSilence
+				v.inner.consecutiveSpeech = 0
+				v.inner.consecutiveSilence = 0
+				v.inner.notifyListeners(VADStateSilence, energy, zcr)
+			}
+		}
+	}
+
+	return v.inner.state
+}
+
+func (v *WebRTCVAD) ProcessFloatFrame(samples []float32) VADState {
+	return v.ProcessFrame(Float32ToInt16(samples))
+}
+
+func (v *WebRTCVAD) RegisterListener(listener VADStateListener) {
+	v.inner.RegisterListener(listener)
+}
+
+func (v *WebRTCVAD) State() VADState {
+	return v.inner.State()
+}
+
+func (v *WebRTCVAD) Reset() {
+	v.inner.Reset()
+}
+
+func (v *WebRTCVAD) UpdateConfig(cfg VADConfig) {
+	v.inner.UpdateConfig(cfg)
+	if cfg.Aggressiveness >= 0 && cfg.Aggressiveness <= 3 {
+		_ = v.detector.SetMode(cfg.Aggressiveness)
+		v.mode = cfg.Aggressiveness
+	}
+}
+
+func (v *WebRTCVAD) Config() VADConfig {
+	cfg := v.inner.Config()
+	cfg.Aggressiveness = v.mode
+	return cfg
+}
+
+func int16ToLittleEndianBytes(samples []int16) []byte {
+	out := make([]byte, len(samples)*2)
+	for i, s := range samples {
+		binary.LittleEndian.PutUint16(out[i*2:], uint16(s))
+	}
+	return out
+}
+
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
new file mode 100644
index 00000000..64d89ffc
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
@@ -0,0 +1,631 @@
+package speech
+
+import (
+	"context"
+	"fmt"
+	"log"
+	"sync"
+	"time"
+)
+
+type VoiceWakeState string
+
+const (
+	VoiceWakeStateIdle       VoiceWakeState = "idle"
+	VoiceWakeStateListening  VoiceWakeState = "listening"
+	VoiceWakeStateRecording  VoiceWakeState = "recording"
+	VoiceWakeStateProcessing VoiceWakeState = "processing"
+	VoiceWakeStateTriggered  VoiceWakeState = "triggered"
+)
+
+type VoiceWakeEventType string
+
+const (
+	VoiceWakeEventStateChanged VoiceWakeEventType = "state_changed"
+	VoiceWakeEventWakeDetected VoiceWakeEventType = "wake_detected"
+	VoiceWakeEventSpeechStart  VoiceWakeEventType = "speech_start"
+	VoiceWakeEventSpeechEnd    VoiceWakeEventType = "speech_end"
+	VoiceWakeEventError        VoiceWakeEventType = "error"
+)
+
+type VoiceWakeEvent struct {
+	Type      VoiceWakeEventType
+	State     VoiceWakeState
+	Timestamp time.Time
+	Data      map[string]any
+}
+
+type VoiceWakeListener func(event VoiceWakeEvent)
+
+type AudioSource interface {
+	Start(ctx context.Context) error
+	Stop() error
+	Read(samples []int16) (int, error)
+	SampleRate() int
+	Channels() int
+}
+
+type VoiceWakeConfig struct {
+	VADConfig        VADConfig
+	VADProvider      VADProviderType
+	WakeWordConfig   WakeWordConfig
+	EngineConfig     WakeWordEngineConfig
+	SampleRate       int
+	Channels         int
+	FrameSize        int
+	MaxRecordingTime time.Duration
+	CooldownTime     time.Duration
+	AudioSource      AudioSource
+	STTPipeline      *STTPipeline
+	AutoTranscribe   bool
+	WakeWordEngine   WakeWordEngineType
+}
+
+func DefaultVoiceWakeConfig() VoiceWakeConfig {
+	return VoiceWakeConfig{
+		VADConfig:        DefaultVADConfig(),
+		VADProvider:      VADProviderWebRTC,
+		WakeWordConfig:   DefaultWakeWordConfig(),
+		SampleRate:       16000,
+		Channels:         1,
+		FrameSize:        320,
+		MaxRecordingTime: 30 * time.Second,
+		CooldownTime:     2 * time.Second,
+		AutoTranscribe:   true,
+	}
+}
+
+type VoiceWake struct {
+	mu              sync.Mutex
+	cfg             VoiceWakeConfig
+	state           VoiceWakeState
+	vad             VADProcessor
+	wakeDetector    *WakeWordDetector
+	engineRouter    *WakeWordEngineRouter
+	engineAdapter   *WakeWordEngineAdapter
+	listeners       []VoiceWakeListener
+	audioBuffer     []int16
+	recordingBuffer []int16
+	isRecording     bool
+	recordingStart  time.Time
+	cooldownUntil   time.Time
+	ctx             context.Context
+	cancel          context.CancelFunc
+	wg              sync.WaitGroup
+	transcriber     *STTPipeline
+	lastTranscript  string
+	lastWakeMatch   string
+	lastConfidence  float64
+	lastEnergy      float64
+}
+
+func NewVoiceWake(cfg VoiceWakeConfig) *VoiceWake {
+	if cfg.SampleRate == 0 {
+		cfg.SampleRate = 16000
+	}
+	if cfg.Channels == 0 {
+		cfg.Channels = 1
+	}
+	if cfg.FrameSize == 0 {
+		cfg.FrameSize = 320
+	}
+	if cfg.MaxRecordingTime == 0 {
+		cfg.MaxRecordingTime = 30 * time.Second
+	}
+	if cfg.CooldownTime == 0 {
+		cfg.CooldownTime = 2 * time.Second
+	}
+
+	cfg.VADConfig.SampleRate = cfg.SampleRate
+	cfg.VADConfig.FrameSize = cfg.FrameSize
+
+	cfg.EngineConfig.SampleRate = cfg.SampleRate
+	cfg.EngineConfig.FrameSize = cfg.FrameSize
+
+	if cfg.VADProvider == "" {
+		cfg.VADProvider = VADProviderHeuristic
+	}
+
+	vadManager := NewVADManager()
+	vad, err := vadManager.New(cfg.VADConfig, cfg.VADProvider)
+	if err != nil {
+		log.Printf("voicewake: failed to create VAD provider %q, fallback to heuristic: %v", cfg.VADProvider, err)
+		vad = NewVAD(cfg.VADConfig)
+	}
+	wakeDetector := NewWakeWordDetector(cfg.WakeWordConfig)
+
+	router := NewWakeWordEngineRouter(cfg.EngineConfig)
+	adapter := NewWakeWordEngineAdapter(router, wakeDetector)
+
+	vw := &VoiceWake{
+		cfg:           cfg,
+		state:         VoiceWakeStateIdle,
+		vad:           vad,
+		wakeDetector:  wakeDetector,
+		engineRouter:  router,
+		engineAdapter: adapter,
+		transcriber:   cfg.STTPipeline,
+	}
+
+	vad.RegisterListener(vw.onVADStateChanged)
+
+	return vw
+}
+
+func (vw *VoiceWake) RegisterListener(listener VoiceWakeListener) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	vw.listeners = append(vw.listeners, listener)
+}
+
+func (vw *VoiceWake) Start(ctx context.Context) error {
+	vw.mu.Lock()
+	if vw.state != VoiceWakeStateIdle {
+		vw.mu.Unlock()
+		return fmt.Errorf("voicewake: already in state %s", vw.state)
+	}
+	vw.state = VoiceWakeStateListening
+	vw.mu.Unlock()
+
+	vw.ctx, vw.cancel = context.WithCancel(ctx)
+
+	if vw.cfg.AudioSource != nil {
+		if err := vw.cfg.AudioSource.Start(vw.ctx); err != nil {
+			vw.mu.Lock()
+			vw.state = VoiceWakeStateIdle
+			vw.mu.Unlock()
+			return fmt.Errorf("voicewake: failed to start audio source: %w", err)
+		}
+	}
+
+	vw.wg.Add(1)
+	go vw.listenLoop()
+
+	vw.notifyListeners(VoiceWakeEvent{
+		Type:      VoiceWakeEventStateChanged,
+		State:     VoiceWakeStateListening,
+		Timestamp: time.Now(),
+		Data:      map[string]any{"message": "Voice wake listener started"},
+	})
+
+	return nil
+}
+
+func (vw *VoiceWake) Stop() error {
+	vw.mu.Lock()
+	if vw.state == VoiceWakeStateIdle {
+		vw.mu.Unlock()
+		return nil
+	}
+
+	if vw.cancel != nil {
+		vw.cancel()
+	}
+	vw.state = VoiceWakeStateIdle
+	vw.mu.Unlock()
+
+	if vw.cfg.AudioSource != nil {
+		_ = vw.cfg.AudioSource.Stop()
+	}
+
+	vw.wg.Wait()
+
+	vw.notifyListeners(VoiceWakeEvent{
+		Type:      VoiceWakeEventStateChanged,
+		State:     VoiceWakeStateIdle,
+		Timestamp: time.Now(),
+		Data:      map[string]any{"message": "Voice wake listener stopped"},
+	})
+
+	return nil
+}
+
+func (vw *VoiceWake) listenLoop() {
+	defer vw.wg.Done()
+
+	samples := make([]int16, vw.cfg.FrameSize)
+
+	for {
+		select {
+		case <-vw.ctx.Done():
+			return
+		default:
+		}
+
+		var n int
+		var err error
+
+		if vw.cfg.AudioSource != nil {
+			n, err = vw.cfg.AudioSource.Read(samples)
+			if err != nil {
+				log.Printf("voicewake: error reading audio: %v", err)
+				time.Sleep(10 * time.Millisecond)
+				continue
+			}
+		} else {
+			time.Sleep(time.Duration(vw.cfg.FrameSize) * time.Second / time.Duration(vw.cfg.SampleRate))
+			continue
+		}
+
+		if n == 0 {
+			continue
+		}
+
+		vw.mu.Lock()
+		inCooldown := time.Now().Before(vw.cooldownUntil)
+		vw.mu.Unlock()
+
+		if inCooldown {
+			continue
+		}
+
+		if vw.engineAdapter != nil && vw.engineAdapter.UseEngine() {
+			result, detected := vw.engineAdapter.ProcessFrame(samples[:n])
+			if detected && result != nil {
+				vw.mu.Lock()
+				vw.lastWakeMatch = result.Keyword
+				vw.lastConfidence = result.Confidence
+				vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
+				vw.mu.Unlock()
+
+				vw.notifyListeners(VoiceWakeEvent{
+					Type:      VoiceWakeEventWakeDetected,
+					State:     VoiceWakeStateTriggered,
+					Timestamp: time.Now(),
+					Data: map[string]any{
+						"phrase":     result.Keyword,
+						"confidence": result.Confidence,
+						"engine":     string(result.Engine),
+						"energy":     0.0,
+					},
+				})
+
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateTriggered)
+				vw.mu.Unlock()
+
+				time.Sleep(vw.cfg.CooldownTime)
+
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateListening)
+				vw.mu.Unlock()
+
+				continue
+			}
+		}
+
+		vw.processAudio(samples[:n])
+	}
+}
+
+func (vw *VoiceWake) processAudio(samples []int16) {
+	vw.mu.Lock()
+	vw.audioBuffer = append(vw.audioBuffer, samples...)
+	vw.mu.Unlock()
+
+	state := vw.vad.ProcessFrame(samples)
+
+	switch state {
+	case VADStateSpeech:
+		vw.mu.Lock()
+		if !vw.isRecording {
+			vw.isRecording = true
+			vw.recordingStart = time.Now()
+			vw.recordingBuffer = make([]int16, 0, vw.cfg.SampleRate*int(vw.cfg.MaxRecordingTime.Seconds()))
+			vw.setState(VoiceWakeStateRecording)
+		}
+		vw.recordingBuffer = append(vw.recordingBuffer, samples...)
+		vw.mu.Unlock()
+
+	case VADStateSilence:
+		vw.mu.Lock()
+		if vw.isRecording {
+			vw.isRecording = false
+			recording := make([]int16, len(vw.recordingBuffer))
+			copy(recording, vw.recordingBuffer)
+			vw.recordingBuffer = nil
+			vw.mu.Unlock()
+
+			vw.processRecording(recording)
+		} else {
+			vw.mu.Unlock()
+		}
+	}
+}
+
+func (vw *VoiceWake) processRecording(samples []int16) {
+	if len(samples) == 0 {
+		return
+	}
+
+	vw.mu.Lock()
+	vw.setState(VoiceWakeStateProcessing)
+	vw.mu.Unlock()
+
+	if vw.cfg.AutoTranscribe && vw.transcriber != nil {
+		audioData := Int16ToWAV(samples, vw.cfg.SampleRate, vw.cfg.Channels)
+
+		go func() {
+			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+			defer cancel()
+
+			result, err := vw.transcriber.TranscribeDirect(ctx, audioData, WithSTTInputFormat(InputWAV))
+			if err != nil {
+				log.Printf("voicewake: transcription error: %v", err)
+				vw.notifyListeners(VoiceWakeEvent{
+					Type:      VoiceWakeEventError,
+					State:     VoiceWakeStateProcessing,
+					Timestamp: time.Now(),
+					Data:      map[string]any{"error": err.Error()},
+				})
+				vw.mu.Lock()
+				vw.setState(VoiceWakeStateListening)
+				vw.mu.Unlock()
+				return
+			}
+
+			vw.mu.Lock()
+			vw.lastTranscript = result.Text
+			vw.mu.Unlock()
+
+			vw.checkWakeWord(result.Text)
+		}()
+	} else {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	}
+}
+
+func (vw *VoiceWake) checkWakeWord(transcript string) {
+	if transcript == "" {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+		return
+	}
+
+	phrase, confidence, matched := vw.wakeDetector.Detect(transcript)
+
+	vw.mu.Lock()
+	vw.lastTranscript = transcript
+	vw.lastWakeMatch = phrase
+	vw.lastConfidence = confidence
+	vw.mu.Unlock()
+
+	if matched {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateTriggered)
+		vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
+		energy := vw.lastEnergy
+		vw.mu.Unlock()
+
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventWakeDetected,
+			State:     VoiceWakeStateTriggered,
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"phrase":     phrase,
+				"confidence": confidence,
+				"transcript": transcript,
+				"energy":     energy,
+			},
+		})
+
+		time.Sleep(vw.cfg.CooldownTime)
+
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	} else {
+		vw.mu.Lock()
+		vw.setState(VoiceWakeStateListening)
+		vw.mu.Unlock()
+	}
+}
+
+func (vw *VoiceWake) onVADStateChanged(state VADState, energy float64, zcr float64) {
+	vw.mu.Lock()
+	vw.lastEnergy = energy
+	vw.mu.Unlock()
+
+	switch state {
+	case VADStateSpeech:
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventSpeechStart,
+			State:     vw.State(),
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"energy": energy,
+				"zcr":    zcr,
+			},
+		})
+
+	case VADStateSilence:
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventSpeechEnd,
+			State:     vw.State(),
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"energy": energy,
+				"zcr":    zcr,
+			},
+		})
+	}
+}
+
+func (vw *VoiceWake) setState(state VoiceWakeState) {
+	oldState := vw.state
+	vw.state = state
+
+	if oldState != state {
+		vw.notifyListeners(VoiceWakeEvent{
+			Type:      VoiceWakeEventStateChanged,
+			State:     state,
+			Timestamp: time.Now(),
+			Data: map[string]any{
+				"previous_state": oldState,
+				"new_state":      state,
+			},
+		})
+	}
+}
+
+func (vw *VoiceWake) State() VoiceWakeState {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.state
+}
+
+func (vw *VoiceWake) notifyListeners(event VoiceWakeEvent) {
+	vw.mu.Lock()
+	listeners := make([]VoiceWakeListener, len(vw.listeners))
+	copy(listeners, vw.listeners)
+	vw.mu.Unlock()
+
+	for _, listener := range listeners {
+		listener(event)
+	}
+}
+
+func (vw *VoiceWake) LastTranscript() string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.lastTranscript
+}
+
+func (vw *VoiceWake) LastWakeMatch() (string, float64) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.lastWakeMatch, vw.lastConfidence
+}
+
+func (vw *VoiceWake) VAD() VADProcessor {
+	return vw.vad
+}
+
+func (vw *VoiceWake) WakeDetector() *WakeWordDetector {
+	return vw.wakeDetector
+}
+
+func (vw *VoiceWake) UpdateConfig(cfg VoiceWakeConfig) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+
+	if cfg.SampleRate > 0 {
+		vw.cfg.SampleRate = cfg.SampleRate
+	}
+	if cfg.Channels > 0 {
+		vw.cfg.Channels = cfg.Channels
+	}
+	if cfg.FrameSize > 0 {
+		vw.cfg.FrameSize = cfg.FrameSize
+	}
+	if cfg.MaxRecordingTime > 0 {
+		vw.cfg.MaxRecordingTime = cfg.MaxRecordingTime
+	}
+	if cfg.CooldownTime > 0 {
+		vw.cfg.CooldownTime = cfg.CooldownTime
+	}
+
+	vw.cfg.AutoTranscribe = cfg.AutoTranscribe
+}
+
+func (vw *VoiceWake) Config() VoiceWakeConfig {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	return vw.cfg
+}
+
+func (vw *VoiceWake) SetTranscriber(pipeline *STTPipeline) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	vw.transcriber = pipeline
+}
+
+func (vw *VoiceWake) RegisterEngine(engineType WakeWordEngineType, cfg WakeWordEngineConfig) error {
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router == nil {
+		return fmt.Errorf("voicewake: no engine router available")
+	}
+
+	if err := router.CreateEngine(engineType, cfg); err != nil {
+		return err
+	}
+
+	vw.engineAdapter.SetUseEngine(true)
+	return nil
+}
+
+func (vw *VoiceWake) SetActiveEngine(name string) error {
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router == nil {
+		return fmt.Errorf("voicewake: no engine router available")
+	}
+
+	return router.SetActive(name)
+}
+
+func (vw *VoiceWake) UseWakeWordEngine(use bool) {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineAdapter != nil {
+		vw.engineAdapter.SetUseEngine(use)
+	}
+}
+
+func (vw *VoiceWake) IsUsingWakeWordEngine() bool {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineAdapter == nil {
+		return false
+	}
+	return vw.engineAdapter.UseEngine()
+}
+
+func (vw *VoiceWake) AvailableEngines() []string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineRouter == nil {
+		return nil
+	}
+	return vw.engineRouter.Engines()
+}
+
+func (vw *VoiceWake) ActiveEngine() string {
+	vw.mu.Lock()
+	defer vw.mu.Unlock()
+	if vw.engineRouter == nil {
+		return ""
+	}
+	return vw.engineRouter.ActiveEngine()
+}
+
+func (vw *VoiceWake) EngineRouter() *WakeWordEngineRouter {
+	return vw.engineRouter
+}
+
+func (vw *VoiceWake) EngineAdapter() *WakeWordEngineAdapter {
+	return vw.engineAdapter
+}
+
+func (vw *VoiceWake) Close() error {
+	if err := vw.Stop(); err != nil {
+		return err
+	}
+
+	vw.mu.Lock()
+	router := vw.engineRouter
+	vw.mu.Unlock()
+
+	if router != nil {
+		return router.Close()
+	}
+	return nil
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
new file mode 100644
index 00000000..3d986daa
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
@@ -0,0 +1,126 @@
+package speech
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"mime/multipart"
+	"net/http"
+)
+
+type openAIAudioAPIClient struct {
+	apiKey  string
+	baseURL string
+	client  *http.Client
+}
+
+func newOpenAIAudioAPIClient(apiKey, baseURL string, client *http.Client) *openAIAudioAPIClient {
+	return &openAIAudioAPIClient{
+		apiKey:  apiKey,
+		baseURL: baseURL,
+		client:  client,
+	}
+}
+
+func (c *openAIAudioAPIClient) DoTranscriptionRequest(ctx context.Context, endpoint string, audio []byte, options TranscribeOptions, stream bool) (*http.Response, error) {
+	body, contentType, err := c.buildMultipartBody(audio, options, stream)
+	if err != nil {
+		return nil, err
+	}
+
+	req, err := http.NewRequestWithContext(ctx, "POST", c.baseURL+endpoint, body)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
+	}
+
+	req.Header.Set("Authorization", "Bearer "+c.apiKey)
+	req.Header.Set("Content-Type", contentType)
+	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
+	if stream {
+		req.Header.Set("Accept", "text/event-stream")
+	}
+
+	resp, err := c.client.Do(req)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: request failed: %v", err)
+	}
+
+	return resp, nil
+}
+
+func (c *openAIAudioAPIClient) buildMultipartBody(audio []byte, options TranscribeOptions, stream bool) (*bytes.Buffer, string, error) {
+	var body bytes.Buffer
+	writer := multipart.NewWriter(&body)
+
+	filename := "audio." + string(options.InputFormat)
+	part, err := writer.CreateFormFile("file", filename)
+	if err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
+	}
+
+	if _, err := part.Write(audio); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
+	}
+
+	if err := writer.WriteField("model", options.Model); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
+	}
+
+	if options.Language != "" {
+		if err := writer.WriteField("language", options.Language); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
+		}
+	}
+
+	if options.Prompt != "" {
+		if err := writer.WriteField("prompt", options.Prompt); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write prompt field: %v", err)
+		}
+	}
+
+	if options.Temperature > 0 {
+		if err := writer.WriteField("temperature", fmt.Sprintf("%.2f", options.Temperature)); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write temperature field: %v", err)
+		}
+	}
+
+	if options.MaxAlternatives > 0 {
+		if err := writer.WriteField("max_alternatives", fmt.Sprintf("%d", options.MaxAlternatives)); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write max_alternatives field: %v", err)
+		}
+	}
+
+	if options.WordTimestamps || options.SpeakerLabels {
+		if options.WordTimestamps {
+			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
+				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
+			}
+		}
+		if options.SpeakerLabels {
+			if err := writer.WriteField("timestamp_granularities[]", "segment"); err != nil {
+				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write segment timestamp_granularities: %v", err)
+			}
+		}
+	}
+
+	responseType := "verbose_json"
+	if stream {
+		responseType = "json"
+	}
+	if err := writer.WriteField("response_format", responseType); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
+	}
+
+	if stream {
+		if err := writer.WriteField("stream", "true"); err != nil {
+			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write stream field: %v", err)
+		}
+	}
+
+	if err := writer.Close(); err != nil {
+		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
+	}
+
+	return &body, writer.FormDataContentType(), nil
+}
+
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
new file mode 100644
index 00000000..e94b451d
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
@@ -0,0 +1,521 @@
+package speech
+
+import (
+	"bufio"
+	"context"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"os"
+	"path/filepath"
+	"strings"
+	"time"
+)
+
+type WhisperModel string
+
+const (
+	WhisperModelV1 WhisperModel = "whisper-1"
+)
+
+var validWhisperModels = map[WhisperModel]bool{
+	WhisperModelV1: true,
+}
+
+var validInputFormats = map[AudioInputFormat]bool{
+	InputMP3:  true,
+	InputWAV:  true,
+	InputOGG:  true,
+	InputFLAC: true,
+	InputM4A:  true,
+	InputMP4:  true,
+	InputMPEG: true,
+	InputMPGA: true,
+	InputWEBM: true,
+}
+
+type WhisperProvider struct {
+	apiKey        string
+	baseURL       string
+	model         WhisperModel
+	language      string
+	timeout       time.Duration
+	retries       int
+	client        *http.Client
+	apiClient     *openAIAudioAPIClient
+	httpTransport *http.Transport
+}
+
+type WhisperOption func(*WhisperProvider)
+
+func WithWhisperBaseURL(url string) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.baseURL = strings.TrimRight(url, "/")
+	}
+}
+
+func WithWhisperModel(model WhisperModel) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.model = model
+	}
+}
+
+func WithWhisperLanguage(lang string) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.language = lang
+	}
+}
+
+func WithWhisperTimeout(timeout time.Duration) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.timeout = timeout
+	}
+}
+
+func WithWhisperRetries(retries int) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.retries = retries
+	}
+}
+
+func WithWhisperHTTPTransport(transport *http.Transport) WhisperOption {
+	return func(p *WhisperProvider) {
+		p.httpTransport = transport
+	}
+}
+
+func NewWhisperProvider(apiKey string, opts ...WhisperOption) (*WhisperProvider, error) {
+	if apiKey == "" {
+		return nil, NewSTTError(ErrAuthentication, "openai: API key is required")
+	}
+
+	p := &WhisperProvider{
+		apiKey:  apiKey,
+		baseURL: "https://api.openai.com",
+		model:   WhisperModelV1,
+		timeout: 120 * time.Second,
+		retries: 2,
+		client:  &http.Client{Timeout: 120 * time.Second},
+	}
+
+	for _, opt := range opts {
+		opt(p)
+	}
+
+	if p.httpTransport != nil {
+		p.client.Transport = p.httpTransport
+	}
+	p.client.Timeout = p.timeout
+	p.apiClient = newOpenAIAudioAPIClient(p.apiKey, p.baseURL, p.client)
+
+	if !validWhisperModels[p.model] {
+		return nil, NewSTTErrorf(ErrProviderNotSupported, "openai: invalid whisper model: %s", p.model)
+	}
+
+	return p, nil
+}
+
+func (p *WhisperProvider) Name() string {
+	return "openai-whisper"
+}
+
+func (p *WhisperProvider) Type() STTProviderType {
+	return STTProviderOpenAI
+}
+
+func (p *WhisperProvider) Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error) {
+	options := TranscribeOptions{
+		Model:       string(p.model),
+		Language:    p.language,
+		Temperature: 0,
+		Mode:        ModeTranscription,
+		InputFormat: InputMP3,
+	}
+	for _, opt := range opts {
+		opt(&options)
+	}
+
+	if err := p.validateTranscribeOptions(options); err != nil {
+		return nil, err
+	}
+
+	if len(audio) == 0 {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
+	}
+
+	const maxAudioSize = 25 * 1024 * 1024
+	if len(audio) > maxAudioSize {
+		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: audio exceeds 25MB limit (%d bytes)", len(audio))
+	}
+
+	if !validInputFormats[options.InputFormat] {
+		return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: unsupported input format: %s", options.InputFormat)
+	}
+
+	var lastErr error
+	for attempt := 0; attempt <= p.retries; attempt++ {
+		if attempt > 0 {
+			backoff := time.Duration(attempt) * time.Second
+			select {
+			case <-ctx.Done():
+				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: context cancelled during retry: %v", ctx.Err())
+			case <-time.After(backoff):
+			}
+		}
+
+		result, err := p.doTranscribe(ctx, audio, options)
+		if err == nil {
+			return result, nil
+		}
+
+		lastErr = err
+
+		if sttErr, ok := err.(*STTError); ok {
+			if sttErr.Code == ErrAuthentication || sttErr.Code == ErrAudioFormatInvalid || sttErr.Code == ErrAudioTooLarge || sttErr.Code == ErrRateLimited {
+				return nil, err
+			}
+		}
+	}
+
+	return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: all %d retries failed: %v", p.retries, lastErr)
+}
+
+func (p *WhisperProvider) TranscribeFile(ctx context.Context, filePath string, opts ...TranscribeOption) (*TranscriptResult, error) {
+	if filePath == "" {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: file path is empty")
+	}
+
+	info, err := os.Stat(filePath)
+	if err != nil {
+		if os.IsNotExist(err) {
+			return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: file not found: %s", filePath)
+		}
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to stat file: %v", err)
+	}
+
+	const maxAudioSize = 25 * 1024 * 1024
+	if info.Size() > maxAudioSize {
+		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: file exceeds 25MB limit (%d bytes)", info.Size())
+	}
+
+	audio, err := os.ReadFile(filePath)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read file: %v", err)
+	}
+
+	if len(opts) == 0 || anyInputFormatNotSet(opts) {
+		ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filePath)), ".")
+		if ext != "" {
+			formatOpts := append([]TranscribeOption{WithSTTInputFormat(AudioInputFormat(ext))}, opts...)
+			return p.Transcribe(ctx, audio, formatOpts...)
+		}
+	}
+
+	return p.Transcribe(ctx, audio, opts...)
+}
+
+func anyInputFormatNotSet(opts []TranscribeOption) bool {
+	for _, opt := range opts {
+		o := &TranscribeOptions{}
+		opt(o)
+		if o.InputFormat != "" {
+			return false
+		}
+	}
+	return true
+}
+
+func (p *WhisperProvider) TranscribeStream(ctx context.Context, reader io.Reader, opts ...TranscribeOption) (*TranscriptResult, error) {
+	if reader == nil {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: reader is nil")
+	}
+
+	audio, err := io.ReadAll(reader)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read stream: %v", err)
+	}
+
+	return p.Transcribe(ctx, audio, opts...)
+}
+
+func (p *WhisperProvider) validateTranscribeOptions(options TranscribeOptions) error {
+	if options.Temperature < 0 || options.Temperature > 1 {
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: temperature must be between 0 and 1, got: %f", options.Temperature)
+	}
+
+	if options.MaxAlternatives < 0 {
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: maxAlternatives cannot be negative")
+	}
+
+	if options.Model == "" {
+		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: model is required")
+	}
+
+	return nil
+}
+
+func (p *WhisperProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	var endpoint string
+	switch options.Mode {
+	case ModeTranslation:
+		endpoint = "/v1/audio/translations"
+	default:
+		endpoint = "/v1/audio/transcriptions"
+	}
+
+	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, false)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return nil, p.handleErrorResponse(resp.StatusCode, respBody)
+	}
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read response: %v", err)
+	}
+
+	return p.parseResponse(respBody, options)
+}
+
+func (p *WhisperProvider) handleErrorResponse(statusCode int, body []byte) error {
+	var errResp whisperErrorResponse
+	if err := json.Unmarshal(body, &errResp); err == nil && errResp.Error.Message != "" {
+		msg := fmt.Sprintf("openai-whisper: API error: %s (type: %s, code: %s)",
+			errResp.Error.Message, errResp.Error.Type, errResp.Error.Code)
+		switch statusCode {
+		case http.StatusUnauthorized:
+			return NewSTTError(ErrAuthentication, msg)
+		case http.StatusTooManyRequests:
+			return NewSTTError(ErrRateLimited, msg)
+		case http.StatusBadRequest:
+			return NewSTTError(ErrAudioFormatInvalid, msg)
+		default:
+			return NewSTTError(ErrTranscriptionFailed, msg)
+		}
+	}
+
+	switch statusCode {
+	case http.StatusUnauthorized:
+		return NewSTTError(ErrAuthentication, fmt.Sprintf("openai-whisper: authentication failed: %s", string(body)))
+	case http.StatusTooManyRequests:
+		return NewSTTError(ErrRateLimited, fmt.Sprintf("openai-whisper: rate limited: %s", string(body)))
+	case http.StatusBadRequest:
+		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: invalid request: %s", string(body))
+	case http.StatusServiceUnavailable:
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: service unavailable: %s", string(body))
+	default:
+		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: unexpected status %d: %s", statusCode, string(body))
+	}
+}
+
+type whisperResponse struct {
+	Text     string  `json:"text"`
+	Language string  `json:"language"`
+	Duration float64 `json:"duration,omitempty"`
+	Segments []struct {
+		ID           int     `json:"id"`
+		Seek         int     `json:"seek"`
+		Start        float64 `json:"start"`
+		End          float64 `json:"end"`
+		Text         string  `json:"text"`
+		Tokens       []int   `json:"tokens"`
+		Temperature  float64 `json:"temperature"`
+		AvgLogProb   float64 `json:"avg_logprob"`
+		Compression  float64 `json:"compression_ratio"`
+		NoSpeechProb float64 `json:"no_speech_prob"`
+		Words        []struct {
+			Word       string  `json:"word"`
+			Start      float64 `json:"start"`
+			End        float64 `json:"end"`
+			Confidence float64 `json:"probability"`
+		} `json:"words,omitempty"`
+	} `json:"segments,omitempty"`
+	LanguageProbability float64 `json:"language_probability,omitempty"`
+}
+
+type whisperErrorResponse struct {
+	Error struct {
+		Message string `json:"message"`
+		Type    string `json:"type"`
+		Code    string `json:"code"`
+	} `json:"error"`
+}
+
+func (p *WhisperProvider) parseResponse(body []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	var resp whisperResponse
+	if err := json.Unmarshal(body, &resp); err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to parse JSON response: %v", err)
+	}
+
+	result := &TranscriptResult{
+		Text:       strings.TrimSpace(resp.Text),
+		Language:   resp.Language,
+		Duration:   time.Duration(resp.Duration * float64(time.Second)),
+		Confidence: resp.LanguageProbability,
+	}
+
+	if len(resp.Segments) > 0 {
+		result.Segments = make([]SegmentInfo, 0, len(resp.Segments))
+		for _, seg := range resp.Segments {
+			segment := SegmentInfo{
+				ID:        seg.ID,
+				Text:      seg.Text,
+				StartTime: time.Duration(seg.Start * float64(time.Second)),
+				EndTime:   time.Duration(seg.End * float64(time.Second)),
+			}
+
+			if seg.AvgLogProb != 0 {
+				segment.Confidence = normalizeLogProb(seg.AvgLogProb)
+			}
+
+			if len(seg.Words) > 0 {
+				segment.Words = make([]WordInfo, 0, len(seg.Words))
+				for _, w := range seg.Words {
+					segment.Words = append(segment.Words, WordInfo{
+						Word:       w.Word,
+						StartTime:  time.Duration(w.Start * float64(time.Second)),
+						EndTime:    time.Duration(w.End * float64(time.Second)),
+						Confidence: w.Confidence,
+					})
+				}
+			}
+
+			result.Segments = append(result.Segments, segment)
+		}
+
+		if len(result.Segments) > 0 && result.Confidence == 0 {
+			totalConfidence := 0.0
+			for _, seg := range result.Segments {
+				totalConfidence += seg.Confidence
+			}
+			result.Confidence = totalConfidence / float64(len(result.Segments))
+		}
+	}
+
+	if options.WordTimestamps && len(result.Segments) > 0 {
+		words := make([]WordInfo, 0)
+		for _, seg := range result.Segments {
+			words = append(words, seg.Words...)
+		}
+		result.Words = words
+	}
+
+	return result, nil
+}
+
+func normalizeLogProb(logProb float64) float64 {
+	if logProb > 0 {
+		return 1.0
+	}
+	prob := 1.0 / (1.0 + logProb*-1)
+	if prob < 0 {
+		return 0
+	}
+	if prob > 1 {
+		return 1
+	}
+	return prob
+}
+
+func (p *WhisperProvider) TranscribeSSE(ctx context.Context, audio []byte, onChunk func(chunk *TranscriptResult), opts ...TranscribeOption) error {
+	options := TranscribeOptions{
+		Model:       string(p.model),
+		Language:    p.language,
+		Temperature: 0,
+		Mode:        ModeTranscription,
+		InputFormat: InputMP3,
+	}
+	for _, opt := range opts {
+		opt(&options)
+	}
+
+	if err := p.validateTranscribeOptions(options); err != nil {
+		return err
+	}
+
+	if len(audio) == 0 {
+		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
+	}
+
+	endpoint := "/v1/audio/transcriptions"
+	if options.Mode == ModeTranslation {
+		endpoint = "/v1/audio/translations"
+	}
+
+	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, true)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode != http.StatusOK {
+		respBody, _ := io.ReadAll(resp.Body)
+		return p.handleErrorResponse(resp.StatusCode, respBody)
+	}
+
+	return p.readSSEStream(resp.Body, onChunk)
+}
+
+func (p *WhisperProvider) readSSEStream(reader io.Reader, onChunk func(chunk *TranscriptResult)) error {
+	scanner := bufio.NewScanner(reader)
+	scanner.Split(bufio.ScanLines)
+
+	var currentText strings.Builder
+	var detectedLanguage string
+
+	for scanner.Scan() {
+		line := scanner.Text()
+
+		if strings.HasPrefix(line, "data: ") {
+			data := strings.TrimPrefix(line, "data: ")
+			if data == "[DONE]" {
+				break
+			}
+
+			var chunk struct {
+				Text     string `json:"text"`
+				Language string `json:"language"`
+				Done     bool   `json:"done"`
+			}
+			if err := json.Unmarshal([]byte(data), &chunk); err != nil {
+				continue
+			}
+
+			if chunk.Text != "" {
+				currentText.WriteString(chunk.Text)
+			}
+			if chunk.Language != "" {
+				detectedLanguage = chunk.Language
+			}
+
+			onChunk(&TranscriptResult{
+				Text:     currentText.String(),
+				Language: detectedLanguage,
+			})
+
+			if chunk.Done {
+				break
+			}
+		}
+	}
+
+	return scanner.Err()
+}
+
+func (p *WhisperProvider) ListLanguages(ctx context.Context) ([]string, error) {
+	return []string{
+		"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da",
+		"de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi",
+		"hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb",
+		"ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "nn",
+		"no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq",
+		"sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi",
+		"yi", "yo", "zh",
+	}, nil
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
new file mode 100644
index 00000000..0917ad2d
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
@@ -0,0 +1,102 @@
+module github.com/1024XEngineer/anyclaw
+
+go 1.25.1
+
+require (
+	github.com/anyclaw/anyclaw v0.0.0
+	github.com/charmbracelet/bubbles v1.0.0
+	github.com/charmbracelet/bubbletea v1.3.10
+	github.com/charmbracelet/lipgloss v1.1.0
+	github.com/charmbracelet/x/term v0.2.2
+	github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
+	github.com/chromedp/chromedp v0.10.0
+	github.com/gorilla/websocket v1.5.3
+	github.com/wailsapp/wails/v2 v2.11.0
+	golang.org/x/sys v0.42.0
+	golang.org/x/text v0.35.0
+	modernc.org/sqlite v1.48.1
+)
+
+replace github.com/anyclaw/anyclaw => .
+
+require (
+	cloud.google.com/go v0.123.0 // indirect
+	cloud.google.com/go/auth v0.20.0 // indirect
+	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
+	cloud.google.com/go/compute/metadata v0.9.0 // indirect
+	cloud.google.com/go/longrunning v0.9.0 // indirect
+	cloud.google.com/go/speech v1.33.0 // indirect
+	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
+	github.com/bep/debounce v1.2.1 // indirect
+	github.com/cespare/xxhash/v2 v2.3.0 // indirect
+	github.com/charmbracelet/colorprofile v0.4.1 // indirect
+	github.com/charmbracelet/x/ansi v0.11.6 // indirect
+	github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
+	github.com/chromedp/sysutil v1.0.0 // indirect
+	github.com/clipperhouse/displaywidth v0.9.0 // indirect
+	github.com/clipperhouse/stringish v0.1.1 // indirect
+	github.com/clipperhouse/uax29/v2 v2.5.0 // indirect
+	github.com/dustin/go-humanize v1.0.1 // indirect
+	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
+	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-logr/stdr v1.2.2 // indirect
+	github.com/go-ole/go-ole v1.3.0 // indirect
+	github.com/gobwas/httphead v0.1.0 // indirect
+	github.com/gobwas/pool v0.2.1 // indirect
+	github.com/gobwas/ws v1.4.0 // indirect
+	github.com/godbus/dbus/v5 v5.1.0 // indirect
+	github.com/godeps/webrtcvad-go v0.1.0 // indirect
+	github.com/google/s2a-go v0.1.9 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect
+	github.com/googleapis/gax-go/v2 v2.21.0 // indirect
+	github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e // indirect
+	github.com/josharian/intern v1.0.0 // indirect
+	github.com/labstack/echo/v4 v4.13.3 // indirect
+	github.com/labstack/gommon v0.4.2 // indirect
+	github.com/leaanthony/go-ansi-parser v1.6.1 // indirect
+	github.com/leaanthony/gosod v1.0.4 // indirect
+	github.com/leaanthony/slicer v1.6.0 // indirect
+	github.com/leaanthony/u v1.1.1 // indirect
+	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
+	github.com/mattn/go-colorable v0.1.13 // indirect
+	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-localereader v0.0.1 // indirect
+	github.com/mattn/go-runewidth v0.0.19 // indirect
+	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
+	github.com/muesli/cancelreader v0.2.2 // indirect
+	github.com/muesli/termenv v0.16.0 // indirect
+	github.com/ncruces/go-strftime v1.0.0 // indirect
+	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
+	github.com/pkg/errors v0.9.1 // indirect
+	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
+	github.com/rivo/uniseg v0.4.7 // indirect
+	github.com/samber/lo v1.49.1 // indirect
+	github.com/tkrajina/go-reflector v0.5.8 // indirect
+	github.com/valyala/bytebufferpool v1.0.0 // indirect
+	github.com/valyala/fasttemplate v1.2.2 // indirect
+	github.com/wailsapp/go-webview2 v1.0.22 // indirect
+	github.com/wailsapp/mimetype v1.4.1 // indirect
+	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
+	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
+	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 // indirect
+	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect
+	go.opentelemetry.io/otel v1.43.0 // indirect
+	go.opentelemetry.io/otel/metric v1.43.0 // indirect
+	go.opentelemetry.io/otel/trace v1.43.0 // indirect
+	golang.org/x/crypto v0.49.0 // indirect
+	golang.org/x/net v0.52.0 // indirect
+	golang.org/x/oauth2 v0.36.0 // indirect
+	golang.org/x/sync v0.20.0 // indirect
+	golang.org/x/time v0.15.0 // indirect
+	google.golang.org/api v0.275.0 // indirect
+	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
+	google.golang.org/grpc v1.80.0 // indirect
+	google.golang.org/protobuf v1.36.11 // indirect
+	modernc.org/libc v1.70.0 // indirect
+	modernc.org/mathutil v1.7.1 // indirect
+	modernc.org/memory v1.11.0 // indirect
+)
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
new file mode 100644
index 00000000..c0e69ec6
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
@@ -0,0 +1,251 @@
+cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE=
+cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU=
+cloud.google.com/go/auth v0.20.0 h1:kXTssoVb4azsVDoUiF8KvxAqrsQcQtB53DcSgta74CA=
+cloud.google.com/go/auth v0.20.0/go.mod h1:942/yi/itH1SsmpyrbnTMDgGfdy2BUqIKyd0cyYLc5Q=
+cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
+cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
+cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
+cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
+cloud.google.com/go/longrunning v0.9.0 h1:0EzbDEGsAvOZNbqXopgniY0w0a1phvu5IdUFq8grmqY=
+cloud.google.com/go/longrunning v0.9.0/go.mod h1:pkTz846W7bF4o2SzdWJ40Hu0Re+UoNT6Q5t+igIcb8E=
+cloud.google.com/go/speech v1.33.0 h1:555yroj4HCS7SPgfHuDU8zX+E5KrhccVWG96HNyBUAk=
+cloud.google.com/go/speech v1.33.0/go.mod h1:shnf33sZbGnQQZyek1fdLOR5rRKV6D3jsNqpqyijvj8=
+github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
+github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
+github.com/bep/debounce v1.2.1 h1:v67fRdBA9UQu2NhLFXrSg0Brw7CexQekrBwDMM8bzeY=
+github.com/bep/debounce v1.2.1/go.mod h1:H8yggRPQKLUhUoqrJC1bO2xNya7vanpDl7xR3ISbCJ0=
+github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
+github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
+github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
+github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
+github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
+github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
+github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
+github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
+github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
+github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
+github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8=
+github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ=
+github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
+github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
+github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
+github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
+github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
+github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
+github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
+github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
+github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
+github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
+github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA=
+github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA=
+github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs=
+github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA=
+github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
+github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
+github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
+github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
+github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
+github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
+github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
+github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
+github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
+github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
+github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
+github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
+github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
+github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
+github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
+github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
+github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk=
+github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
+github.com/godeps/webrtcvad-go v0.1.0 h1:JpVfJHSzND9p/iuO7xqko1UlB/UJjKxskEWEbzKKjrQ=
+github.com/godeps/webrtcvad-go v0.1.0/go.mod h1:487THSHEZrYU29LRm4AKYCm/Y8PPq3pIJSuz1KX3MwU=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
+github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
+github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
+github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8=
+github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg=
+github.com/googleapis/gax-go/v2 v2.21.0 h1:h45NjjzEO3faG9Lg/cFrBh2PgegVVgzqKzuZl/wMbiI=
+github.com/googleapis/gax-go/v2 v2.21.0/go.mod h1:But/NJU6TnZsrLai/xBAQLLz+Hc7fHZJt/hsCz3Fih4=
+github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
+github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
+github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
+github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
+github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e h1:Q3+PugElBCf4PFpxhErSzU3/PY5sFL5Z6rfv4AbGAck=
+github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e/go.mod h1:alcuEEnZsY1WQsagKhZDsoPCRoOijYqhZvPwLG0kzVs=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
+github.com/labstack/echo/v4 v4.13.3 h1:pwhpCPrTl5qry5HRdM5FwdXnhXSLSY+WE+YQSeCaafY=
+github.com/labstack/echo/v4 v4.13.3/go.mod h1:o90YNEeQWjDozo584l7AwhJMHN0bOC4tAfg+Xox9q5g=
+github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
+github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
+github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oOc=
+github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA=
+github.com/leaanthony/go-ansi-parser v1.6.1 h1:xd8bzARK3dErqkPFtoF9F3/HgN8UQk0ed1YDKpEz01A=
+github.com/leaanthony/go-ansi-parser v1.6.1/go.mod h1:+vva/2y4alzVmmIEpk9QDhA7vLC5zKDTRwfZGOp3IWU=
+github.com/leaanthony/gosod v1.0.4 h1:YLAbVyd591MRffDgxUOU1NwLhT9T1/YiwjKZpkNFeaI=
+github.com/leaanthony/gosod v1.0.4/go.mod h1:GKuIL0zzPj3O1SdWQOdgURSuhkF+Urizzxh26t9f1cw=
+github.com/leaanthony/slicer v1.6.0 h1:1RFP5uiPJvT93TAHi+ipd3NACobkW53yUiBqZheE/Js=
+github.com/leaanthony/slicer v1.6.0/go.mod h1:o/Iz29g7LN0GqH3aMjWAe90381nyZlDNquK+mtH2Fj8=
+github.com/leaanthony/u v1.1.1 h1:TUFjwDGlNX+WuwVEzDqQwC2lOv0P4uhTQw7CMFdiK7M=
+github.com/leaanthony/u v1.1.1/go.mod h1:9+o6hejoRljvZ3BzdYlVL0JYCwtnAsVuN9pVTQcaRfI=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
+github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
+github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
+github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
+github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
+github.com/matryer/is v1.4.1 h1:55ehd8zaGABKLXQUe2awZ99BD/PTc2ls+KV/dXphgEQ=
+github.com/matryer/is v1.4.1/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
+github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
+github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
+github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
+github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
+github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
+github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
+github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
+github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
+github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
+github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
+github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
+github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
+github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
+github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
+github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
+github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
+github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
+github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
+github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
+github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
+github.com/samber/lo v1.49.1 h1:4BIFyVfuQSEpluc7Fua+j1NolZHiEHEpaSEKdsH0tew=
+github.com/samber/lo v1.49.1/go.mod h1:dO6KHFzUKXgP8LDhU0oI8d2hekjXnGOu0DB8Jecxd6o=
+github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
+github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
+github.com/tkrajina/go-reflector v0.5.8 h1:yPADHrwmUbMq4RGEyaOUpz2H90sRsETNVpjzo3DLVQQ=
+github.com/tkrajina/go-reflector v0.5.8/go.mod h1:ECbqLgccecY5kPmPmXg1MrHW585yMcDkVl6IvJe64T4=
+github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
+github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
+github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
+github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
+github.com/wailsapp/go-webview2 v1.0.22 h1:YT61F5lj+GGaat5OB96Aa3b4QA+mybD0Ggq6NZijQ58=
+github.com/wailsapp/go-webview2 v1.0.22/go.mod h1:qJmWAmAmaniuKGZPWwne+uor3AHMB5PFhqiK0Bbj8kc=
+github.com/wailsapp/mimetype v1.4.1 h1:pQN9ycO7uo4vsUUuPeHEYoUkLVkaRntMnHJxVwYhwHs=
+github.com/wailsapp/mimetype v1.4.1/go.mod h1:9aV5k31bBOv5z6u+QP8TltzvNGJPmNJD4XlAL3U+j3o=
+github.com/wailsapp/wails/v2 v2.11.0 h1:seLacV8pqupq32IjS4Y7V8ucab0WZwtK6VvUVxSBtqQ=
+github.com/wailsapp/wails/v2 v2.11.0/go.mod h1:jrf0ZaM6+GBc1wRmXsM8cIvzlg0karYin3erahI4+0k=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
+github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
+go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
+go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 h1:yI1/OhfEPy7J9eoa6Sj051C7n5dvpj0QX8g4sRchg04=
+go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0/go.mod h1:NoUCKYWK+3ecatC4HjkRktREheMeEtrXoQxrqYFeHSc=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 h1:OyrsyzuttWTSur2qN/Lm0m2a8yqyIjUVBZcxFPuXq2o=
+go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0/go.mod h1:C2NGBr+kAB4bk3xtMXfZ94gqFDtg/GkI7e9zqGh5Beg=
+go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
+go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
+go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
+go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
+go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
+go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
+golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
+golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
+golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
+golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
+golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY=
+golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70=
+golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
+golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
+golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
+golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
+golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
+golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
+golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs=
+golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q=
+golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
+golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
+golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
+golang.org/x/sys v0.0.0-20200810151505-1b9f1253b3ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
+golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
+golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
+golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
+golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
+golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
+golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
+golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
+google.golang.org/api v0.275.0 h1:vfY5d9vFVJeWEZT65QDd9hbndr7FyZ2+6mIzGAh71NI=
+google.golang.org/api v0.275.0/go.mod h1:Fnag/EWUPIcJXuIkP1pjoTgS5vdxlk3eeemL7Do6bvw=
+google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
+google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
+google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
+google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
+google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
+google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
+modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
+modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw=
+modernc.org/ccgo/v4 v4.32.0/go.mod h1:6F08EBCx5uQc38kMGl+0Nm0oWczoo1c7cgpzEry7Uc0=
+modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
+modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
+modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
+modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
+modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
+modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
+modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
+modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
+modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
+modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
+modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
+modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
+modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
+modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
+modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
+modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
+modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
+modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
+modernc.org/sqlite v1.48.1 h1:S85iToyU6cgeojybE2XJlSbcsvcWkQ6qqNXJHtW5hWA=
+modernc.org/sqlite v1.48.1/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
+modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
+modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
+modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
+modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
new file mode 100644
index 00000000..ebb7145b
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
@@ -0,0 +1,502 @@
+package speech
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+	"time"
+
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/googleapi"
+	"google.golang.org/api/option"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+	"google.golang.org/protobuf/types/known/durationpb"
+)
+
+type GoogleModel string
+
+const (
+	GoogleModelLatestLong            GoogleModel = "latest_long"
+	GoogleModelLatestShort           GoogleModel = "latest_short"
+	GoogleModelCommandAndSearch      GoogleModel = "command_and_search"
+	GoogleModelPhoneCall             GoogleModel = "phone_call"
+	GoogleModelVideo                 GoogleModel = "video"
+	GoogleModelDefault               GoogleModel = "default"
+	GoogleModelMedicalConversational GoogleModel = "medical_conversational"
+	GoogleModelMedicalDictation      GoogleModel = "medical_dictation"
+)
+
+type GoogleRecognitionConfig struct {
+	Encoding                            RecognitionEncoding
+	SampleRateHertz                     int32
+	AudioChannelCount                   int32
+	EnableSeparateRecognitionPerChannel bool
+	LanguageCode                        string
+	MaxAlternatives                     int32
+	ProfanityFilter                     bool
+	SpeechContexts                      []GoogleSpeechContext
+	EnableWordTimeOffsets               bool
+	EnableWordConfidence                bool
+	EnableAutomaticPunctuation          bool
+	EnableSpokenPunctuation             bool
+	Model                               string
+	UseEnhanced                         bool
+}
+
+type RecognitionEncoding string
+
+const (
+	EncodingLinear16             RecognitionEncoding = "LINEAR16"
+	EncodingFLAC                 RecognitionEncoding = "FLAC"
+	EncodingMULAW                RecognitionEncoding = "MULAW"
+	EncodingAMR                  RecognitionEncoding = "AMR"
+	EncodingAMRWB                RecognitionEncoding = "AMR_WB"
+	EncodingOGGOpus              RecognitionEncoding = "OGG_OPUS"
+	EncodingSpeexWithHeaderByte  RecognitionEncoding = "SPEEX_WITH_HEADER_BYTE"
+	EncodingMP3                  RecognitionEncoding = "MP3"
+	EncodingWEBMOpus             RecognitionEncoding = "WEBM_OPUS"
+	EncodingENCODING_UNSPECIFIED RecognitionEncoding = "ENCODING_UNSPECIFIED"
+)
+
+type GoogleSpeechContext struct {
+	Phrases []string
+	Boost   float32
+}
+
+type GoogleProvider struct {
+	apiKey          string
+	credentialsJSON string
+	baseURL         string
+	languageCode    string
+	model           GoogleModel
+	useEnhanced     bool
+	timeout         time.Duration
+	retries         int
+	client          googleRecognizeAPI
+}
+
+type GoogleOption func(*GoogleProvider)
+
+func WithGoogleBaseURL(url string) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.baseURL = strings.TrimRight(url, "/")
+	}
+}
+
+func WithGoogleLanguageCode(code string) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.languageCode = code
+	}
+}
+
+func WithGoogleModel(model GoogleModel) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.model = model
+	}
+}
+
+func WithGoogleEnhanced(enabled bool) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.useEnhanced = enabled
+	}
+}
+
+func WithGoogleTimeout(timeout time.Duration) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.timeout = timeout
+	}
+}
+
+func WithGoogleRetries(retries int) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.retries = retries
+	}
+}
+
+func WithGoogleCredentialsJSON(credentialsJSON string) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.credentialsJSON = credentialsJSON
+	}
+}
+
+func withGoogleRecognizeClient(client googleRecognizeAPI) GoogleOption {
+	return func(p *GoogleProvider) {
+		p.client = client
+	}
+}
+
+func NewGoogleProvider(apiKey string, opts ...GoogleOption) (*GoogleProvider, error) {
+	p := &GoogleProvider{
+		apiKey:       apiKey,
+		baseURL:      "https://speech.googleapis.com",
+		languageCode: "en-US",
+		model:        GoogleModelDefault,
+		timeout:      120 * time.Second,
+		retries:      2,
+	}
+
+	for _, opt := range opts {
+		opt(p)
+	}
+
+	if p.apiKey == "" && p.credentialsJSON == "" {
+		return nil, NewSTTError(ErrAuthentication, "google: API key or credentials JSON is required")
+	}
+
+	if p.client == nil {
+		client, err := newGoogleRecognizeClient(context.Background(), p.clientOptions()...)
+		if err != nil {
+			return nil, NewSTTErrorf(ErrAuthentication, "google-speech: failed to initialize official client: %v", err)
+		}
+		p.client = client
+	}
+
+	return p, nil
+}
+
+func (p *GoogleProvider) clientOptions() []option.ClientOption {
+	opts := make([]option.ClientOption, 0, 2)
+	if p.credentialsJSON != "" {
+		opts = append(opts, option.WithCredentialsJSON([]byte(p.credentialsJSON)))
+	} else {
+		opts = append(opts, option.WithAPIKey(p.apiKey))
+	}
+	if p.baseURL != "" && p.baseURL != "https://speech.googleapis.com" {
+		opts = append(opts, option.WithEndpoint(p.baseURL))
+	}
+	return opts
+}
+
+func (p *GoogleProvider) Name() string {
+	return "google-speech"
+}
+
+func (p *GoogleProvider) Type() STTProviderType {
+	return STTProviderGoogle
+}
+
+func (p *GoogleProvider) Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error) {
+	options := TranscribeOptions{
+		Language:    p.languageCode,
+		Mode:        ModeTranscription,
+		InputFormat: InputMP3,
+	}
+	for _, opt := range opts {
+		opt(&options)
+	}
+
+	if len(audio) == 0 {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "google-speech: audio data is empty")
+	}
+
+	const maxAudioSize = 100 * 1024 * 1024
+	if len(audio) > maxAudioSize {
+		return nil, NewSTTErrorf(ErrAudioTooLarge, "google-speech: audio exceeds 100MB limit (%d bytes)", len(audio))
+	}
+
+	if !validInputFormats[options.InputFormat] {
+		return nil, NewSTTErrorf(ErrAudioFormatInvalid, "google-speech: unsupported input format: %s", options.InputFormat)
+	}
+
+	var lastErr error
+	for attempt := 0; attempt <= p.retries; attempt++ {
+		if attempt > 0 {
+			backoff := time.Duration(attempt) * time.Second
+			select {
+			case <-ctx.Done():
+				return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: context cancelled during retry: %v", ctx.Err())
+			case <-time.After(backoff):
+			}
+		}
+
+		result, err := p.doTranscribe(ctx, audio, options)
+		if err == nil {
+			return result, nil
+		}
+
+		lastErr = err
+
+		if sttErr, ok := err.(*STTError); ok {
+			if sttErr.Code == ErrAuthentication || sttErr.Code == ErrAudioFormatInvalid || sttErr.Code == ErrAudioTooLarge || sttErr.Code == ErrRateLimited {
+				return nil, err
+			}
+		}
+	}
+
+	return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: all %d retries failed: %v", p.retries, lastErr)
+}
+
+func (p *GoogleProvider) TranscribeFile(ctx context.Context, filePath string, opts ...TranscribeOption) (*TranscriptResult, error) {
+	return nil, NewSTTError(ErrProviderNotSupported, "google-speech: file transcription requires GCS URI, use Transcribe with file content instead")
+}
+
+func (p *GoogleProvider) TranscribeStream(ctx context.Context, reader io.Reader, opts ...TranscribeOption) (*TranscriptResult, error) {
+	if reader == nil {
+		return nil, NewSTTError(ErrAudioFormatInvalid, "google-speech: reader is nil")
+	}
+
+	audio, err := io.ReadAll(reader)
+	if err != nil {
+		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to read stream: %v", err)
+	}
+
+	return p.Transcribe(ctx, audio, opts...)
+}
+
+func (p *GoogleProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
+	req := p.buildRecognizeRequest(audio, options)
+
+	requestCtx := ctx
+	var cancel context.CancelFunc
+	if _, hasDeadline := ctx.Deadline(); !hasDeadline && p.timeout > 0 {
+		requestCtx, cancel = context.WithTimeout(ctx, p.timeout)
+		defer cancel()
+	}
+
+	resp, err := p.client.Recognize(requestCtx, req)
+	if err != nil {
+		return nil, p.handleClientError(err)
+	}
+
+	return p.parseRecognizeResponse(resp, options)
+}
+
+func (p *GoogleProvider) buildRecognizeRequest(audio []byte, options TranscribeOptions) *speechpb.RecognizeRequest {
+	encoding := p.mapInputFormatToEncoding(options.InputFormat)
+
+	sampleRate := int32(options.SampleRate)
+	if sampleRate == 0 {
+		sampleRate = p.guessSampleRate(options.InputFormat)
+	}
+
+	return &speechpb.RecognizeRequest{
+		Config: &speechpb.RecognitionConfig{
+			Encoding:                   p.toProtoRecognitionEncoding(encoding),
+			SampleRateHertz:            sampleRate,
+			LanguageCode:               options.Language,
+			Model:                      string(p.model),
+			UseEnhanced:                p.useEnhanced,
+			MaxAlternatives:            int32(options.MaxAlternatives),
+			EnableWordTimeOffsets:      options.WordTimestamps,
+			EnableWordConfidence:       true,
+			EnableAutomaticPunctuation: true,
+		},
+		Audio: &speechpb.RecognitionAudio{
+			AudioSource: &speechpb.RecognitionAudio_Content{Content: audio},
+		},
+	}
+}
+
+func (p *GoogleProvider) mapInputFormatToEncoding(format AudioInputFormat) RecognitionEncoding {
+	switch format {
+	case InputWAV, InputPCM:
+		return EncodingLinear16
+	case InputFLAC:
+		return EncodingFLAC
+	case InputMP3:
+		return EncodingMP3
+	case InputOGG:
+		return EncodingOGGOpus
+	case InputWEBM:
+		return EncodingWEBMOpus
+	case InputM4A, InputMP4:
+		return EncodingWEBMOpus
+	case InputMPEG, InputMPGA:
+		return EncodingMP3
+	default:
+		return EncodingMP3
+	}
+}
+
+func (p *GoogleProvider) toProtoRecognitionEncoding(encoding RecognitionEncoding) speechpb.RecognitionConfig_AudioEncoding {
+	switch encoding {
+	case EncodingLinear16:
+		return speechpb.RecognitionConfig_LINEAR16
+	case EncodingFLAC:
+		return speechpb.RecognitionConfig_FLAC
+	case EncodingMULAW:
+		return speechpb.RecognitionConfig_MULAW
+	case EncodingAMR:
+		return speechpb.RecognitionConfig_AMR
+	case EncodingAMRWB:
+		return speechpb.RecognitionConfig_AMR_WB
+	case EncodingOGGOpus:
+		return speechpb.RecognitionConfig_OGG_OPUS
+	case EncodingSpeexWithHeaderByte:
+		return speechpb.RecognitionConfig_SPEEX_WITH_HEADER_BYTE
+	case EncodingWEBMOpus:
+		return speechpb.RecognitionConfig_WEBM_OPUS
+	case EncodingMP3:
+		return speechpb.RecognitionConfig_MP3
+	default:
+		return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
+	}
+}
+
+func (p *GoogleProvider) guessSampleRate(format AudioInputFormat) int32 {
+	switch format {
+	case InputWAV, InputPCM:
+		return 16000
+	case InputFLAC:
+		return 16000
+	case InputMP3:
+		return 16000
+	case InputOGG, InputWEBM:
+		return 48000
+	case InputM4A, InputMP4:
+		return 44100
+	default:
+		return 16000
+	}
+}
+
+func (p *GoogleProvider) handleClientError(err error) error {
+	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request context error: %v", err)
+	}
+
+	var apiErr *googleapi.Error
+	if errors.As(err, &apiErr) {
+		msg := fmt.Sprintf("google-speech: API error: %s", apiErr.Message)
+		switch apiErr.Code {
+		case 400:
+			return NewSTTError(ErrAudioFormatInvalid, msg)
+		case 401, 403:
+			return NewSTTError(ErrAuthentication, msg)
+		case 429:
+			return NewSTTError(ErrRateLimited, msg)
+		default:
+			return NewSTTError(ErrTranscriptionFailed, msg)
+		}
+	}
+
+	switch status.Code(err) {
+	case codes.InvalidArgument:
+		return NewSTTError(ErrAudioFormatInvalid, "google-speech: invalid recognition request")
+	case codes.Unauthenticated, codes.PermissionDenied:
+		return NewSTTError(ErrAuthentication, "google-speech: authentication failed")
+	case codes.ResourceExhausted:
+		return NewSTTError(ErrRateLimited, "google-speech: rate limited")
+	default:
+		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request failed: %v", err)
+	}
+}
+
+func (p *GoogleProvider) parseRecognizeResponse(resp *speechpb.RecognizeResponse, options TranscribeOptions) (*TranscriptResult, error) {
+	results := resp.GetResults()
+	if len(results) == 0 {
+		return &TranscriptResult{
+			Text:     "",
+			Language: options.Language,
+		}, nil
+	}
+
+	result := &TranscriptResult{}
+	var totalConfidence float64
+	var confidenceCount int
+	var lastEnd time.Duration
+
+	for i, res := range results {
+		if len(res.GetAlternatives()) == 0 {
+			continue
+		}
+
+		primary := res.GetAlternatives()[0]
+		segment := SegmentInfo{
+			ID:   i,
+			Text: primary.GetTranscript(),
+		}
+
+		if confidence := primary.GetConfidence(); confidence > 0 {
+			segment.Confidence = float64(confidence)
+			totalConfidence += segment.Confidence
+			confidenceCount++
+		}
+
+		if len(primary.GetWords()) > 0 {
+			segment.Words = make([]WordInfo, 0, len(primary.GetWords()))
+			for _, word := range primary.GetWords() {
+				wordInfo := WordInfo{
+					Word:       word.GetWord(),
+					StartTime:  parseProtoDuration(word.GetStartTime()),
+					EndTime:    parseProtoDuration(word.GetEndTime()),
+					Confidence: float64(word.GetConfidence()),
+				}
+				segment.Words = append(segment.Words, wordInfo)
+			}
+			segment.StartTime = segment.Words[0].StartTime
+			segment.EndTime = segment.Words[len(segment.Words)-1].EndTime
+		} else {
+			segment.EndTime = parseProtoDuration(res.GetResultEndTime())
+		}
+
+		if options.WordTimestamps && len(segment.Words) > 0 {
+			result.Words = append(result.Words, segment.Words...)
+		}
+
+		result.Segments = append(result.Segments, segment)
+
+		if i == 0 {
+			result.Text = primary.GetTranscript()
+			if lang := res.GetLanguageCode(); lang != "" {
+				result.Language = lang
+			}
+		} else {
+			result.Text += " " + primary.GetTranscript()
+		}
+
+		if options.MaxAlternatives > 1 && len(res.GetAlternatives()) > 1 {
+			for _, alt := range res.GetAlternatives()[1:] {
+				result.Alternatives = append(result.Alternatives, alt.GetTranscript())
+			}
+		}
+
+		if segment.EndTime > lastEnd {
+			lastEnd = segment.EndTime
+		}
+	}
+
+	if result.Language == "" {
+		result.Language = options.Language
+	}
+
+	if confidenceCount > 0 {
+		result.Confidence = totalConfidence / float64(confidenceCount)
+	}
+
+	if lastEnd > 0 {
+		result.Duration = lastEnd
+	} else {
+		result.Duration = parseProtoDuration(resp.GetTotalBilledTime())
+	}
+
+	return result, nil
+}
+
+func parseProtoDuration(d *durationpb.Duration) time.Duration {
+	if d == nil {
+		return 0
+	}
+	return d.AsDuration()
+}
+
+func (p *GoogleProvider) ListLanguages(ctx context.Context) ([]string, error) {
+	return []string{
+		"af-ZA", "am-ET", "hy-AM", "az-AZ", "id-ID", "ms-MY", "bn-BD", "bn-IN", "ca-ES", "cs-CZ",
+		"da-DK", "de-DE", "en-AU", "en-CA", "en-GH", "en-GB", "en-IN", "en-IE", "en-KE", "en-NZ",
+		"en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ", "en-US", "es-AR", "es-BO", "es-CL", "es-CO",
+		"es-CR", "es-EC", "es-SV", "es-ES", "es-US", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA",
+		"es-PY", "es-PE", "es-PR", "es-DO", "es-UY", "es-VE", "eu-ES", "fil-PH", "fr-CA", "fr-FR",
+		"gl-ES", "ka-GE", "gu-IN", "hr-HR", "zu-ZA", "is-IS", "it-IT", "jv-ID", "kn-IN", "km-KH",
+		"lo-LA", "lv-LV", "lt-LT", "hu-HU", "ml-IN", "mr-IN", "nl-NL", "ne-NP", "nb-NO", "pl-PL",
+		"pt-BR", "pt-PT", "ro-RO", "si-LK", "sk-SK", "sl-SI", "sr-RS", "fi-FI", "sv-SE", "ta-IN",
+		"ta-SG", "ta-LK", "ta-MY", "te-IN", "vi-VN", "tr-TR", "ur-IN", "ur-PK", "el-GR", "bg-BG",
+		"ru-RU", "sr-RS", "uk-UA", "he-IL", "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IQ", "ar-JO",
+		"ar-KW", "ar-LB", "ar-LY", "ar-MA", "ar-OM", "ar-QA", "ar-SA", "ar-PS", "ar-SY", "ar-TN",
+		"ar-YE", "fa-IR", "hi-IN", "th-TH", "ko-KR", "zh-TW", "ja-JP", "zh", "zh-CN", "zh-HK",
+		"yue-Hant-HK",
+	}, nil
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
new file mode 100644
index 00000000..aed01cb6
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
@@ -0,0 +1,34 @@
+package speech
+
+import (
+	"context"
+
+	speechapi "cloud.google.com/go/speech/apiv1"
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/option"
+)
+
+type googleRecognizeAPI interface {
+	Recognize(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
+	Close() error
+}
+
+type googleRecognizeClient struct {
+	client *speechapi.Client
+}
+
+func newGoogleRecognizeClient(ctx context.Context, clientOpts ...option.ClientOption) (googleRecognizeAPI, error) {
+	client, err := speechapi.NewRESTClient(ctx, clientOpts...)
+	if err != nil {
+		return nil, err
+	}
+	return &googleRecognizeClient{client: client}, nil
+}
+
+func (c *googleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+	return c.client.Recognize(ctx, req)
+}
+
+func (c *googleRecognizeClient) Close() error {
+	return c.client.Close()
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
new file mode 100644
index 00000000..cb7fbdac
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
@@ -0,0 +1,544 @@
+package speech
+
+import (
+	"context"
+	"errors"
+	"math"
+	"strings"
+	"testing"
+	"time"
+
+	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
+	"google.golang.org/api/googleapi"
+	"google.golang.org/protobuf/types/known/durationpb"
+)
+
+type fakeGoogleRecognizeClient struct {
+	calls       int
+	lastRequest *speechpb.RecognizeRequest
+	recognizeFn func(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
+}
+
+func (f *fakeGoogleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+	f.calls++
+	f.lastRequest = req
+	if f.recognizeFn != nil {
+		return f.recognizeFn(ctx, req)
+	}
+	return &speechpb.RecognizeResponse{}, nil
+}
+
+func (f *fakeGoogleRecognizeClient) Close() error {
+	return nil
+}
+
+func TestNewGoogleProvider(t *testing.T) {
+	t.Run("requires API key or credentials JSON", func(t *testing.T) {
+		_, err := NewGoogleProvider("", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+		if err == nil {
+			t.Fatal("expected error when auth config is empty")
+		}
+		sttErr, ok := err.(*STTError)
+		if !ok {
+			t.Fatalf("expected *STTError, got %T", err)
+		}
+		if sttErr.Code != ErrAuthentication {
+			t.Errorf("expected ErrAuthentication, got %s", sttErr.Code)
+		}
+	})
+
+	t.Run("creates provider with defaults", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{}
+		p, err := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if p.Name() != "google-speech" {
+			t.Errorf("expected name google-speech, got %s", p.Name())
+		}
+		if p.Type() != STTProviderGoogle {
+			t.Errorf("expected type %s, got %s", STTProviderGoogle, p.Type())
+		}
+		if p.baseURL != "https://speech.googleapis.com" {
+			t.Errorf("expected default baseURL, got %s", p.baseURL)
+		}
+		if p.languageCode != "en-US" {
+			t.Errorf("expected default language en-US, got %s", p.languageCode)
+		}
+		if p.model != GoogleModelDefault {
+			t.Errorf("expected default model %s, got %s", GoogleModelDefault, p.model)
+		}
+		if p.retries != 2 {
+			t.Errorf("expected 2 retries, got %d", p.retries)
+		}
+		if p.client != fake {
+			t.Fatal("expected injected fake client to be used")
+		}
+	})
+
+	t.Run("applies options", func(t *testing.T) {
+		p, err := NewGoogleProvider("test-key",
+			withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}),
+			WithGoogleBaseURL("https://custom.speech.api.com/"),
+			WithGoogleLanguageCode("zh-CN"),
+			WithGoogleModel(GoogleModelLatestLong),
+			WithGoogleEnhanced(true),
+			WithGoogleTimeout(30*time.Second),
+			WithGoogleRetries(5),
+			WithGoogleCredentialsJSON(`{"type":"service_account"}`),
+		)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if p.baseURL != "https://custom.speech.api.com" {
+			t.Errorf("expected custom baseURL, got %s", p.baseURL)
+		}
+		if p.languageCode != "zh-CN" {
+			t.Errorf("expected language zh-CN, got %s", p.languageCode)
+		}
+		if p.model != GoogleModelLatestLong {
+			t.Errorf("expected model %s, got %s", GoogleModelLatestLong, p.model)
+		}
+		if !p.useEnhanced {
+			t.Error("expected useEnhanced to be true")
+		}
+		if p.timeout != 30*time.Second {
+			t.Errorf("expected 30s timeout, got %v", p.timeout)
+		}
+		if p.retries != 5 {
+			t.Errorf("expected 5 retries, got %d", p.retries)
+		}
+		if p.credentialsJSON == "" {
+			t.Error("expected credentials JSON to be stored")
+		}
+	})
+}
+
+func TestGoogleProviderTranscribe(t *testing.T) {
+	t.Run("rejects empty audio", func(t *testing.T) {
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+		_, err := p.Transcribe(context.Background(), nil)
+		if err == nil {
+			t.Fatal("expected error for empty audio")
+		}
+	})
+
+	t.Run("rejects audio too large", func(t *testing.T) {
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+		largeAudio := make([]byte, 101*1024*1024)
+		_, err := p.Transcribe(context.Background(), largeAudio)
+		if err == nil {
+			t.Fatal("expected error for audio too large")
+		}
+		sttErr, ok := err.(*STTError)
+		if !ok {
+			t.Fatalf("expected *STTError, got %T", err)
+		}
+		if sttErr.Code != ErrAudioTooLarge {
+			t.Errorf("expected ErrAudioTooLarge, got %s", sttErr.Code)
+		}
+	})
+
+	t.Run("successful transcription and request mapping", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{
+									Transcript: "Hello world",
+									Confidence: 0.95,
+									Words: []*speechpb.WordInfo{
+										{
+											Word:       "Hello",
+											Confidence: 0.96,
+											StartTime:  durationpb.New(0),
+											EndTime:    durationpb.New(500 * time.Millisecond),
+										},
+										{
+											Word:       "world",
+											Confidence: 0.94,
+											StartTime:  durationpb.New(600 * time.Millisecond),
+											EndTime:    durationpb.New(time.Second),
+										},
+									},
+								},
+							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(2500 * time.Millisecond),
+						},
+					},
+				}, nil
+			},
+		}
+
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio-data"),
+			WithSTTLanguage("zh-CN"),
+			WithSTTWordTimestamps(true),
+			WithSTTMaxAlternatives(3),
+		)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+
+		if result.Text != "Hello world" {
+			t.Errorf("expected 'Hello world', got '%s'", result.Text)
+		}
+		if result.Language != "en-US" {
+			t.Errorf("expected language 'en-US', got '%s'", result.Language)
+		}
+		if result.Duration != time.Second {
+			t.Errorf("expected duration 1s from word timestamps, got %v", result.Duration)
+		}
+		if len(result.Segments) != 1 {
+			t.Fatalf("expected 1 segment, got %d", len(result.Segments))
+		}
+		if len(result.Segments[0].Words) != 2 {
+			t.Fatalf("expected 2 words, got %d", len(result.Segments[0].Words))
+		}
+		if math.Abs(result.Confidence-0.95) > 0.0001 {
+			t.Errorf("expected confidence 0.95, got %f", result.Confidence)
+		}
+
+		req := fake.lastRequest
+		if req == nil {
+			t.Fatal("expected request to be captured")
+		}
+		if req.GetConfig().GetLanguageCode() != "zh-CN" {
+			t.Errorf("expected request language zh-CN, got %s", req.GetConfig().GetLanguageCode())
+		}
+		if !req.GetConfig().GetEnableWordTimeOffsets() {
+			t.Error("expected EnableWordTimeOffsets to be true")
+		}
+		if req.GetConfig().GetMaxAlternatives() != 3 {
+			t.Errorf("expected max alternatives 3, got %d", req.GetConfig().GetMaxAlternatives())
+		}
+		if req.GetConfig().GetEncoding() != speechpb.RecognitionConfig_MP3 {
+			t.Errorf("expected MP3 encoding, got %v", req.GetConfig().GetEncoding())
+		}
+		if len(req.GetAudio().GetContent()) == 0 {
+			t.Error("expected inline audio content to be populated")
+		}
+	})
+
+	t.Run("multiple segments and alternatives", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{Transcript: "First segment", Confidence: 0.9},
+								{Transcript: "First segments", Confidence: 0.7},
+							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(time.Second),
+						},
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{
+								{Transcript: "Second segment", Confidence: 0.8},
+								{Transcript: "Second segments", Confidence: 0.6},
+							},
+							LanguageCode:  "en-US",
+							ResultEndTime: durationpb.New(2 * time.Second),
+						},
+					},
+				}, nil
+			},
+		}
+
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio"), WithSTTMaxAlternatives(3))
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+
+		if result.Text != "First segment Second segment" {
+			t.Errorf("unexpected combined text: %s", result.Text)
+		}
+		if len(result.Segments) != 2 {
+			t.Fatalf("expected 2 segments, got %d", len(result.Segments))
+		}
+		if len(result.Alternatives) != 2 {
+			t.Fatalf("expected 2 alternatives, got %d", len(result.Alternatives))
+		}
+		if result.Duration != 2*time.Second {
+			t.Errorf("expected duration 2s, got %v", result.Duration)
+		}
+	})
+
+	t.Run("empty results", func(t *testing.T) {
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if result.Text != "" {
+			t.Errorf("expected empty text, got %q", result.Text)
+		}
+	})
+
+	t.Run("does not retry auth errors", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return nil, &googleapi.Error{Code: 401, Message: "invalid API key"}
+			},
+		}
+		p, _ := NewGoogleProvider("bad-key", withGoogleRecognizeClient(fake), WithGoogleRetries(3))
+		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		if err == nil {
+			t.Fatal("expected authentication error")
+		}
+		if fake.calls != 1 {
+			t.Errorf("expected 1 call, got %d", fake.calls)
+		}
+	})
+
+	t.Run("retries transient errors then succeeds", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{}
+		fake.recognizeFn = func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+			if fake.calls == 1 {
+				return nil, &googleapi.Error{Code: 503, Message: "service unavailable"}
+			}
+			return &speechpb.RecognizeResponse{
+				Results: []*speechpb.SpeechRecognitionResult{
+					{
+						Alternatives:  []*speechpb.SpeechRecognitionAlternative{{Transcript: "Success after retry"}},
+						LanguageCode:  "en-US",
+						ResultEndTime: durationpb.New(time.Second),
+					},
+				},
+			}, nil
+		}
+
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(2))
+		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if result.Text != "Success after retry" {
+			t.Errorf("expected 'Success after retry', got '%s'", result.Text)
+		}
+		if fake.calls != 2 {
+			t.Errorf("expected 2 calls, got %d", fake.calls)
+		}
+	})
+
+	t.Run("context cancellation", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return nil, context.Canceled
+			},
+		}
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(0))
+		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
+		if err == nil {
+			t.Fatal("expected context cancellation error")
+		}
+	})
+}
+
+func TestGoogleProviderTranscribeStream(t *testing.T) {
+	t.Run("rejects nil reader", func(t *testing.T) {
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+		_, err := p.TranscribeStream(context.Background(), nil)
+		if err == nil {
+			t.Fatal("expected error for nil reader")
+		}
+	})
+
+	t.Run("successful stream transcription", func(t *testing.T) {
+		fake := &fakeGoogleRecognizeClient{
+			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
+				return &speechpb.RecognizeResponse{
+					Results: []*speechpb.SpeechRecognitionResult{
+						{
+							Alternatives: []*speechpb.SpeechRecognitionAlternative{{Transcript: "Stream content", Confidence: 0.9}},
+							LanguageCode: "en-US",
+						},
+					},
+				}, nil
+			},
+		}
+		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
+		reader := strings.NewReader("stream-audio-data")
+		result, err := p.TranscribeStream(context.Background(), reader)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if result.Text != "Stream content" {
+			t.Errorf("expected 'Stream content', got '%s'", result.Text)
+		}
+	})
+}
+
+func TestGoogleProviderTranscribeFile(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+	_, err := p.TranscribeFile(context.Background(), "/some/file.mp3")
+	if err == nil {
+		t.Fatal("expected error for file transcription")
+	}
+	sttErr, ok := err.(*STTError)
+	if !ok {
+		t.Fatalf("expected *STTError, got %T", err)
+	}
+	if sttErr.Code != ErrProviderNotSupported {
+		t.Errorf("expected ErrProviderNotSupported, got %s", sttErr.Code)
+	}
+}
+
+func TestGoogleProviderListLanguages(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+	langs, err := p.ListLanguages(context.Background())
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(langs) == 0 {
+		t.Fatal("expected non-empty language list")
+	}
+}
+
+func TestGoogleProviderEncodingMapping(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+
+	tests := []struct {
+		format AudioInputFormat
+		want   RecognitionEncoding
+	}{
+		{InputWAV, EncodingLinear16},
+		{InputPCM, EncodingLinear16},
+		{InputFLAC, EncodingFLAC},
+		{InputMP3, EncodingMP3},
+		{InputOGG, EncodingOGGOpus},
+		{InputWEBM, EncodingWEBMOpus},
+		{InputM4A, EncodingWEBMOpus},
+		{InputMP4, EncodingWEBMOpus},
+		{InputMPEG, EncodingMP3},
+		{InputMPGA, EncodingMP3},
+	}
+
+	for _, tt := range tests {
+		t.Run(string(tt.format), func(t *testing.T) {
+			got := p.mapInputFormatToEncoding(tt.format)
+			if got != tt.want {
+				t.Errorf("mapInputFormatToEncoding(%s) = %s, want %s", tt.format, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestGoogleProviderSampleRateGuessing(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+
+	tests := []struct {
+		format AudioInputFormat
+		want   int32
+	}{
+		{InputWAV, 16000},
+		{InputPCM, 16000},
+		{InputFLAC, 16000},
+		{InputMP3, 16000},
+		{InputOGG, 48000},
+		{InputWEBM, 48000},
+		{InputM4A, 44100},
+		{InputMP4, 44100},
+	}
+
+	for _, tt := range tests {
+		t.Run(string(tt.format), func(t *testing.T) {
+			got := p.guessSampleRate(tt.format)
+			if got != tt.want {
+				t.Errorf("guessSampleRate(%s) = %d, want %d", tt.format, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestParseProtoDuration(t *testing.T) {
+	tests := []struct {
+		name string
+		d    *durationpb.Duration
+		want time.Duration
+	}{
+		{"nil", nil, 0},
+		{"zero", durationpb.New(0), 0},
+		{"one second", durationpb.New(time.Second), time.Second},
+		{"500ms", durationpb.New(500 * time.Millisecond), 500 * time.Millisecond},
+		{"2.5s", durationpb.New(2500 * time.Millisecond), 2500 * time.Millisecond},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := parseProtoDuration(tt.d)
+			if got != tt.want {
+				t.Errorf("parseProtoDuration(%v) = %v, want %v", tt.d, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestNewSTTProviderGoogle(t *testing.T) {
+	p, err := NewSTTProvider(STTConfig{
+		Type:    STTProviderGoogle,
+		APIKey:  "test-key",
+		Timeout: 30 * time.Second,
+	})
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if p.Type() != STTProviderGoogle {
+		t.Errorf("expected STTProviderGoogle, got %s", p.Type())
+	}
+	if p.Name() != "google-speech" {
+		t.Errorf("expected name 'google-speech', got %s", p.Name())
+	}
+}
+
+func TestGoogleSTTManager(t *testing.T) {
+	m := NewSTTManager()
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+
+	err := m.Register("google", p)
+	if err != nil {
+		t.Fatalf("failed to register provider: %v", err)
+	}
+
+	got, err := m.Get("google")
+	if err != nil {
+		t.Fatalf("failed to get provider: %v", err)
+	}
+	if got.Type() != STTProviderGoogle {
+		t.Errorf("expected STTProviderGoogle, got %s", got.Type())
+	}
+}
+
+func TestGoogleProviderHandleClientError(t *testing.T) {
+	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
+
+	tests := []struct {
+		name string
+		err  error
+		want STTErrorCode
+	}{
+		{"bad request", &googleapi.Error{Code: 400, Message: "bad request"}, ErrAudioFormatInvalid},
+		{"unauthorized", &googleapi.Error{Code: 401, Message: "unauthorized"}, ErrAuthentication},
+		{"forbidden", &googleapi.Error{Code: 403, Message: "forbidden"}, ErrAuthentication},
+		{"rate limited", &googleapi.Error{Code: 429, Message: "quota"}, ErrRateLimited},
+		{"generic", errors.New("boom"), ErrTranscriptionFailed},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			err := p.handleClientError(tt.err)
+			sttErr, ok := err.(*STTError)
+			if !ok {
+				t.Fatalf("expected *STTError, got %T", err)
+			}
+			if sttErr.Code != tt.want {
+				t.Errorf("expected %s, got %s", tt.want, sttErr.Code)
+			}
+		})
+	}
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
new file mode 100644
index 00000000..10f6bd5a
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
@@ -0,0 +1,249 @@
+package speech
+
+import (
+	"context"
+	"fmt"
+	"time"
+)
+
+type STTProviderType string
+
+const (
+	STTProviderOpenAI        STTProviderType = "openai"
+	STTProviderAzure         STTProviderType = "azure"
+	STTProviderGoogle        STTProviderType = "google"
+	STTProviderDeepgram      STTProviderType = "deepgram"
+	STTProviderAssemblyAI    STTProviderType = "assemblyai"
+	STTProviderWhisperCPP    STTProviderType = "whisper.cpp"
+	STTProviderVosk          STTProviderType = "vosk"
+	STTProviderFasterWhisper STTProviderType = "faster-whisper"
+	STTProviderCustom        STTProviderType = "custom"
+)
+
+type AudioInputFormat string
+
+const (
+	InputMP3  AudioInputFormat = "mp3"
+	InputWAV  AudioInputFormat = "wav"
+	InputOGG  AudioInputFormat = "ogg"
+	InputFLAC AudioInputFormat = "flac"
+	InputPCM  AudioInputFormat = "pcm"
+	InputM4A  AudioInputFormat = "m4a"
+	InputMP4  AudioInputFormat = "mp4"
+	InputMPEG AudioInputFormat = "mpeg"
+	InputMPGA AudioInputFormat = "mpga"
+	InputWEBM AudioInputFormat = "webm"
+)
+
+type STTProvider interface {
+	Name() string
+	Type() STTProviderType
+	Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error)
+	ListLanguages(ctx context.Context) ([]string, error)
+}
+
+type STTConfig struct {
+	Type            STTProviderType
+	APIKey          string
+	CredentialsJSON string
+	BaseURL         string
+	Model           string
+	Language        string
+	SampleRate      int
+	Timeout         time.Duration
+}
+
+func NewSTTProvider(cfg STTConfig) (STTProvider, error) {
+	switch cfg.Type {
+	case STTProviderOpenAI:
+		opts := []WhisperOption{}
+		if cfg.BaseURL != "" {
+			opts = append(opts, WithWhisperBaseURL(cfg.BaseURL))
+		}
+		if cfg.Model != "" {
+			opts = append(opts, WithWhisperModel(WhisperModel(cfg.Model)))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithWhisperLanguage(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithWhisperTimeout(cfg.Timeout))
+		}
+		return NewWhisperProvider(cfg.APIKey, opts...)
+	case STTProviderGoogle:
+		opts := []GoogleOption{}
+		if cfg.CredentialsJSON != "" {
+			opts = append(opts, WithGoogleCredentialsJSON(cfg.CredentialsJSON))
+		}
+		if cfg.BaseURL != "" {
+			opts = append(opts, WithGoogleBaseURL(cfg.BaseURL))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithGoogleLanguageCode(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithGoogleTimeout(cfg.Timeout))
+		}
+		return NewGoogleProvider(cfg.APIKey, opts...)
+	case STTProviderWhisperCPP:
+		opts := []WhisperCPPOption{}
+		if cfg.Model != "" {
+			opts = append(opts, WithWhisperCPPModelPath(cfg.Model))
+		}
+		if cfg.Language != "" {
+			opts = append(opts, WithWhisperCPPLanguage(cfg.Language))
+		}
+		if cfg.Timeout > 0 {
+			opts = append(opts, WithWhisperCPPTimeout(cfg.Timeout))
+		}
+		return NewWhisperCPPProvider(opts...)
+	default:
+		return nil, NewSTTError(ErrProviderNotSupported, "unknown STT provider: "+string(cfg.Type))
+	}
+}
+
+type TranscribeMode string
+
+const (
+	ModeTranscription TranscribeMode = "transcription"
+	ModeTranslation   TranscribeMode = "translation"
+)
+
+type TranscribeOptions struct {
+	Language        string
+	Model           string
+	Prompt          string
+	Temperature     float64
+	Mode            TranscribeMode
+	InputFormat     AudioInputFormat
+	SampleRate      int
+	WordTimestamps  bool
+	SpeakerLabels   bool
+	MaxAlternatives int
+}
+
+type TranscribeOption func(*TranscribeOptions)
+
+func WithSTTLanguage(lang string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Language = lang
+	}
+}
+
+func WithSTTModel(model string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Model = model
+	}
+}
+
+func WithSTTPrompt(prompt string) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Prompt = prompt
+	}
+}
+
+func WithSTTTemperature(temp float64) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Temperature = temp
+	}
+}
+
+func WithSTTMode(mode TranscribeMode) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.Mode = mode
+	}
+}
+
+func WithSTTInputFormat(format AudioInputFormat) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.InputFormat = format
+	}
+}
+
+func WithSTTSampleRate(rate int) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.SampleRate = rate
+	}
+}
+
+func WithSTTWordTimestamps(enabled bool) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.WordTimestamps = enabled
+	}
+}
+
+func WithSTTSpeakerLabels(enabled bool) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.SpeakerLabels = enabled
+	}
+}
+
+func WithSTTMaxAlternatives(n int) TranscribeOption {
+	return func(o *TranscribeOptions) {
+		o.MaxAlternatives = n
+	}
+}
+
+type WordInfo struct {
+	Word       string
+	StartTime  time.Duration
+	EndTime    time.Duration
+	Confidence float64
+}
+
+type SegmentInfo struct {
+	ID         int
+	Text       string
+	StartTime  time.Duration
+	EndTime    time.Duration
+	Confidence float64
+	Speaker    string
+	Words      []WordInfo
+}
+
+type TranscriptResult struct {
+	Text         string
+	Language     string
+	Duration     time.Duration
+	Confidence   float64
+	Segments     []SegmentInfo
+	Words        []WordInfo
+	Alternatives []string
+}
+
+type STTErrorCode string
+
+const (
+	ErrProviderNotSupported STTErrorCode = "provider_not_supported"
+	ErrAudioFormatInvalid   STTErrorCode = "audio_format_invalid"
+	ErrTranscriptionFailed  STTErrorCode = "transcription_failed"
+	ErrAudioTooLong         STTErrorCode = "audio_too_long"
+	ErrAudioTooLarge        STTErrorCode = "audio_too_large"
+	ErrRateLimited          STTErrorCode = "rate_limited"
+	ErrAuthentication       STTErrorCode = "authentication_failed"
+)
+
+type STTError struct {
+	Code    STTErrorCode
+	Message string
+	Err     error
+}
+
+func NewSTTError(code STTErrorCode, message string) *STTError {
+	return &STTError{Code: code, Message: message}
+}
+
+func NewSTTErrorf(code STTErrorCode, format string, args ...interface{}) *STTError {
+	return &STTError{Code: code, Message: fmt.Sprintf(format, args...)}
+}
+
+func (e *STTError) Error() string {
+	if e.Err != nil {
+		return string(e.Code) + ": " + e.Message + ": " + e.Err.Error()
+	}
+	return string(e.Code) + ": " + e.Message
+}
+
+func (e *STTError) Unwrap() error {
+	return e.Err
+}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md b/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md
new file mode 100644
index 00000000..19636a76
--- /dev/null
+++ b/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md
@@ -0,0 +1,46 @@
+# Speech VAD/STT Remaining Unuploaded Export
+
+This folder collects the speech-related changes exported from branch:
+
+- `feature/speech-vad-stt-phase1`
+
+It follows the same idea as the existing `_gateway_remaining_unuploaded_*` folders:
+
+- group changed files by topic
+- preserve relative project paths under each group
+- make it easy to review or move patches in batches
+
+## Groups
+
+### 01_vad_provider_and_voicewake
+
+Contains the VAD-side refactor and default provider switch:
+
+- provider abstraction for VAD
+- fallback heuristic VAD retained
+- WebRTC VAD implementation added
+- `VoiceWake` switched to use the provider interface and default WebRTC path
+
+### 02_stt_openai_client_refactor
+
+Contains the OpenAI STT cleanup:
+
+- multipart/request logic extracted into a thin client
+- `WhisperProvider` simplified to reuse the shared client
+
+### 03_stt_google_official_client
+
+Contains the Google STT migration:
+
+- provider moved from handwritten REST calls to the official Google Speech Go client
+- thin Google client wrapper added
+- provider config extended with `CredentialsJSON`
+- tests updated to use fake client injection
+- `go.mod` / `go.sum` included because the dependency graph changed
+
+## Validation
+
+The branch state corresponding to these files passed:
+
+- `go test ./pkg/speech/...`
+- `go test ./pkg/gateway/...`

From c65cc6c4809645ae8f023a0887ed5517ce139e56 Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Tue, 28 Apr 2026 17:23:50 +0800
Subject: [PATCH 5/6] chore(export): remove mistaken in-repo speech export
 folder

---
 .../pkg/speech/vad.go                         | 319 ---------
 .../pkg/speech/vad_provider.go                |  58 --
 .../pkg/speech/vad_webrtc.go                  | 144 ----
 .../pkg/speech/voicewake.go                   | 631 ------------------
 .../pkg/speech/stt_openai_client.go           | 126 ----
 .../pkg/speech/stt_whisper.go                 | 521 ---------------
 .../03_stt_google_official_client/go.mod      | 102 ---
 .../03_stt_google_official_client/go.sum      | 251 -------
 .../pkg/speech/stt_google.go                  | 502 --------------
 .../pkg/speech/stt_google_client.go           |  34 -
 .../pkg/speech/stt_google_test.go             | 544 ---------------
 .../pkg/speech/stt_provider.go                | 249 -------
 .../README.md                                 |  46 --
 13 files changed, 3527 deletions(-)
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
 delete mode 100644 _speech_vad_stt_remaining_unuploaded_20260428_171936/README.md

diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
deleted file mode 100644
index 045f0f9f..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad.go
+++ /dev/null
@@ -1,319 +0,0 @@
-package speech
-
-import (
-	"math"
-	"sync"
-)
-
-type VADState string
-
-const (
-	VADStateSilence VADState = "silence"
-	VADStateSpeech  VADState = "speech"
-)
-
-type VADConfig struct {
-	SampleRate         int
-	FrameSize          int
-	Aggressiveness     int
-	EnergyThreshold    float64
-	ZeroCrossThreshold int
-	SpeechMinFrames    int
-	SilenceFrames      int
-	HangoverFrames     int
-}
-
-func DefaultVADConfig() VADConfig {
-	return VADConfig{
-		SampleRate:         16000,
-		FrameSize:          320,
-		Aggressiveness:     2,
-		EnergyThreshold:    0.01,
-		ZeroCrossThreshold: 50,
-		SpeechMinFrames:    3,
-		SilenceFrames:      30,
-		HangoverFrames:     10,
-	}
-}
-
-type VAD struct {
-	mu                 sync.Mutex
-	cfg                VADConfig
-	state              VADState
-	consecutiveSpeech  int
-	consecutiveSilence int
-	listeners          []VADStateListener
-}
-
-type VADStateListener func(state VADState, energy float64, zcr float64)
-
-func (v *VAD) Name() string {
-	return "heuristic-vad"
-}
-
-func (v *VAD) Type() VADProviderType {
-	return VADProviderHeuristic
-}
-
-func NewVAD(cfg VADConfig) *VAD {
-	if cfg.SampleRate == 0 {
-		cfg.SampleRate = 16000
-	}
-	if cfg.FrameSize == 0 {
-		cfg.FrameSize = 320
-	}
-	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
-		cfg.Aggressiveness = 2
-	}
-	if cfg.EnergyThreshold == 0 {
-		cfg.EnergyThreshold = 0.01
-	}
-	if cfg.ZeroCrossThreshold == 0 {
-		cfg.ZeroCrossThreshold = 50
-	}
-	if cfg.SpeechMinFrames == 0 {
-		cfg.SpeechMinFrames = 3
-	}
-	if cfg.SilenceFrames == 0 {
-		cfg.SilenceFrames = 30
-	}
-	if cfg.HangoverFrames == 0 {
-		cfg.HangoverFrames = 10
-	}
-
-	return &VAD{
-		cfg:   cfg,
-		state: VADStateSilence,
-	}
-}
-
-func (v *VAD) RegisterListener(listener VADStateListener) {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-	v.listeners = append(v.listeners, listener)
-}
-
-func (v *VAD) ProcessFrame(samples []int16) VADState {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-
-	energy := v.calculateRMS(samples)
-	zcr := v.calculateZeroCrossingRate(samples)
-
-	isSpeech := v.isSpeechFrame(energy, zcr)
-
-	if isSpeech {
-		v.consecutiveSpeech++
-		v.consecutiveSilence = 0
-	} else {
-		v.consecutiveSilence++
-		v.consecutiveSpeech = 0
-	}
-
-	switch v.state {
-	case VADStateSilence:
-		if isSpeech {
-			if v.consecutiveSpeech >= v.cfg.SpeechMinFrames {
-				v.state = VADStateSpeech
-				v.notifyListeners(VADStateSpeech, energy, zcr)
-			}
-		} else {
-			v.consecutiveSpeech = 0
-		}
-
-	case VADStateSpeech:
-		if isSpeech {
-			v.consecutiveSilence = 0
-		} else {
-			if v.consecutiveSilence >= v.cfg.HangoverFrames {
-				v.state = VADStateSilence
-				v.consecutiveSpeech = 0
-				v.consecutiveSilence = 0
-				v.notifyListeners(VADStateSilence, energy, zcr)
-			}
-		}
-	}
-
-	return v.state
-}
-
-func (v *VAD) ProcessFloatFrame(samples []float32) VADState {
-	intSamples := make([]int16, len(samples))
-	for i, s := range samples {
-		clamped := s
-		if clamped > 1.0 {
-			clamped = 1.0
-		}
-		if clamped < -1.0 {
-			clamped = -1.0
-		}
-		intSamples[i] = int16(clamped * 32767.0)
-	}
-	return v.ProcessFrame(intSamples)
-}
-
-func (v *VAD) isSpeechFrame(energy float64, zcr float64) bool {
-	return energy > v.cfg.EnergyThreshold || zcr > float64(v.cfg.ZeroCrossThreshold)
-}
-
-func (v *VAD) calculateRMS(samples []int16) float64 {
-	if len(samples) == 0 {
-		return 0
-	}
-
-	var sumSquares float64
-	for _, s := range samples {
-		normalized := float64(s) / 32768.0
-		sumSquares += normalized * normalized
-	}
-
-	return math.Sqrt(sumSquares / float64(len(samples)))
-}
-
-func (v *VAD) calculateZeroCrossingRate(samples []int16) float64 {
-	if len(samples) < 2 {
-		return 0
-	}
-
-	var crossings int
-	for i := 1; i < len(samples); i++ {
-		if (samples[i] >= 0 && samples[i-1] < 0) || (samples[i] < 0 && samples[i-1] >= 0) {
-			crossings++
-		}
-	}
-
-	return float64(crossings)
-}
-
-func (v *VAD) State() VADState {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-	return v.state
-}
-
-func (v *VAD) Reset() {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-	v.state = VADStateSilence
-	v.consecutiveSpeech = 0
-	v.consecutiveSilence = 0
-}
-
-func (v *VAD) notifyListeners(state VADState, energy float64, zcr float64) {
-	for _, listener := range v.listeners {
-		listener(state, energy, zcr)
-	}
-}
-
-func (v *VAD) UpdateConfig(cfg VADConfig) {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-	if cfg.EnergyThreshold > 0 {
-		v.cfg.EnergyThreshold = cfg.EnergyThreshold
-	}
-	if cfg.ZeroCrossThreshold > 0 {
-		v.cfg.ZeroCrossThreshold = cfg.ZeroCrossThreshold
-	}
-	if cfg.SpeechMinFrames > 0 {
-		v.cfg.SpeechMinFrames = cfg.SpeechMinFrames
-	}
-	if cfg.SilenceFrames > 0 {
-		v.cfg.SilenceFrames = cfg.SilenceFrames
-	}
-	if cfg.HangoverFrames > 0 {
-		v.cfg.HangoverFrames = cfg.HangoverFrames
-	}
-}
-
-func (v *VAD) Config() VADConfig {
-	v.mu.Lock()
-	defer v.mu.Unlock()
-	return v.cfg
-}
-
-func NormalizeAudio(samples []int16) []float64 {
-	result := make([]float64, len(samples))
-	for i, s := range samples {
-		result[i] = float64(s) / 32768.0
-	}
-	return result
-}
-
-func Float32ToInt16(samples []float32) []int16 {
-	result := make([]int16, len(samples))
-	for i, s := range samples {
-		clamped := s
-		if clamped > 1.0 {
-			clamped = 1.0
-		}
-		if clamped < -1.0 {
-			clamped = -1.0
-		}
-		result[i] = int16(clamped * 32767.0)
-	}
-	return result
-}
-
-func Int16ToWAV(samples []int16, sampleRate int, channels int) []byte {
-	if len(samples) == 0 {
-		return nil
-	}
-
-	bitsPerSample := 16
-	byteRate := sampleRate * channels * bitsPerSample / 8
-	blockAlign := channels * bitsPerSample / 8
-	dataSize := len(samples) * 2
-	fileSize := 36 + dataSize
-
-	buf := make([]byte, 44+dataSize)
-
-	copy(buf[0:4], []byte("RIFF"))
-	buf[4] = byte(fileSize)
-	buf[5] = byte(fileSize >> 8)
-	buf[6] = byte(fileSize >> 16)
-	buf[7] = byte(fileSize >> 24)
-
-	copy(buf[8:12], []byte("WAVE"))
-
-	copy(buf[12:16], []byte("fmt "))
-	buf[16] = 16
-	buf[17] = 0
-	buf[18] = 0
-	buf[19] = 0
-
-	buf[20] = 1
-	buf[21] = 0
-
-	buf[22] = byte(channels)
-	buf[23] = 0
-
-	buf[24] = byte(sampleRate)
-	buf[25] = byte(sampleRate >> 8)
-	buf[26] = byte(sampleRate >> 16)
-	buf[27] = byte(sampleRate >> 24)
-
-	buf[28] = byte(byteRate)
-	buf[29] = byte(byteRate >> 8)
-	buf[30] = byte(byteRate >> 16)
-	buf[31] = byte(byteRate >> 24)
-
-	buf[32] = byte(blockAlign)
-	buf[33] = 0
-
-	buf[34] = byte(bitsPerSample)
-	buf[35] = 0
-
-	copy(buf[36:40], []byte("data"))
-	buf[40] = byte(dataSize)
-	buf[41] = byte(dataSize >> 8)
-	buf[42] = byte(dataSize >> 16)
-	buf[43] = byte(dataSize >> 24)
-
-	for i, s := range samples {
-		offset := 44 + i*2
-		buf[offset] = byte(s)
-		buf[offset+1] = byte(s >> 8)
-	}
-
-	return buf
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
deleted file mode 100644
index ce0de4e2..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_provider.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package speech
-
-import "fmt"
-
-type VADProviderType string
-
-const (
-	VADProviderHeuristic VADProviderType = "heuristic"
-	VADProviderWebRTC    VADProviderType = "webrtc"
-)
-
-type VADProcessor interface {
-	Name() string
-	Type() VADProviderType
-	ProcessFrame(samples []int16) VADState
-	ProcessFloatFrame(samples []float32) VADState
-	RegisterListener(listener VADStateListener)
-	State() VADState
-	Reset()
-	UpdateConfig(cfg VADConfig)
-	Config() VADConfig
-}
-
-type VADProviderFactory func(cfg VADConfig) (VADProcessor, error)
-
-type VADManager struct {
-	factories map[VADProviderType]VADProviderFactory
-}
-
-func NewVADManager() *VADManager {
-	m := &VADManager{
-		factories: map[VADProviderType]VADProviderFactory{},
-	}
-	m.Register(VADProviderHeuristic, func(cfg VADConfig) (VADProcessor, error) {
-		return NewVAD(cfg), nil
-	})
-	m.Register(VADProviderWebRTC, func(cfg VADConfig) (VADProcessor, error) {
-		return NewWebRTCVAD(cfg)
-	})
-	return m
-}
-
-func (m *VADManager) Register(providerType VADProviderType, factory VADProviderFactory) {
-	m.factories[providerType] = factory
-}
-
-func (m *VADManager) New(cfg VADConfig, providerType VADProviderType) (VADProcessor, error) {
-	if providerType == "" {
-		providerType = VADProviderHeuristic
-	}
-
-	factory, ok := m.factories[providerType]
-	if !ok {
-		return nil, fmt.Errorf("vad: unsupported provider %q", providerType)
-	}
-
-	return factory(cfg)
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
deleted file mode 100644
index 4a7d2163..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/vad_webrtc.go
+++ /dev/null
@@ -1,144 +0,0 @@
-package speech
-
-import (
-	"encoding/binary"
-	"fmt"
-
-	webrtcvad "github.com/godeps/webrtcvad-go"
-)
-
-type WebRTCVAD struct {
-	inner      *VAD
-	detector   *webrtcvad.VAD
-	mode       int
-	sampleRate int
-	frameSize  int
-}
-
-func NewWebRTCVAD(cfg VADConfig) (*WebRTCVAD, error) {
-	if cfg.SampleRate == 0 {
-		cfg.SampleRate = 16000
-	}
-	if cfg.FrameSize == 0 {
-		cfg.FrameSize = 320
-	}
-	if cfg.Aggressiveness < 0 || cfg.Aggressiveness > 3 {
-		cfg.Aggressiveness = 2
-	}
-
-	if !webrtcvad.ValidRateAndFrameLength(cfg.SampleRate, cfg.FrameSize) {
-		return nil, fmt.Errorf("vad: invalid WebRTC sampleRate/frameSize combination: %d/%d", cfg.SampleRate, cfg.FrameSize)
-	}
-
-	detector, err := webrtcvad.New(cfg.Aggressiveness)
-	if err != nil {
-		return nil, fmt.Errorf("vad: failed to create WebRTC VAD: %w", err)
-	}
-
-	return &WebRTCVAD{
-		inner:      NewVAD(cfg),
-		detector:   detector,
-		mode:       cfg.Aggressiveness,
-		sampleRate: cfg.SampleRate,
-		frameSize:  cfg.FrameSize,
-	}, nil
-}
-
-func (v *WebRTCVAD) Name() string {
-	return "webrtc-vad"
-}
-
-func (v *WebRTCVAD) Type() VADProviderType {
-	return VADProviderWebRTC
-}
-
-func (v *WebRTCVAD) ProcessFrame(samples []int16) VADState {
-	if len(samples) == 0 {
-		return v.inner.ProcessFrame(samples)
-	}
-
-	audio := int16ToLittleEndianBytes(samples)
-	isSpeech, err := v.detector.IsSpeech(audio, v.sampleRate)
-	if err != nil {
-		return v.inner.ProcessFrame(samples)
-	}
-
-	v.inner.mu.Lock()
-	defer v.inner.mu.Unlock()
-
-	energy := v.inner.calculateRMS(samples)
-	zcr := v.inner.calculateZeroCrossingRate(samples)
-
-	if isSpeech {
-		v.inner.consecutiveSpeech++
-		v.inner.consecutiveSilence = 0
-	} else {
-		v.inner.consecutiveSilence++
-		v.inner.consecutiveSpeech = 0
-	}
-
-	switch v.inner.state {
-	case VADStateSilence:
-		if isSpeech {
-			if v.inner.consecutiveSpeech >= v.inner.cfg.SpeechMinFrames {
-				v.inner.state = VADStateSpeech
-				v.inner.notifyListeners(VADStateSpeech, energy, zcr)
-			}
-		} else {
-			v.inner.consecutiveSpeech = 0
-		}
-
-	case VADStateSpeech:
-		if isSpeech {
-			v.inner.consecutiveSilence = 0
-		} else {
-			if v.inner.consecutiveSilence >= v.inner.cfg.HangoverFrames {
-				v.inner.state = VADStateSilence
-				v.inner.consecutiveSpeech = 0
-				v.inner.consecutiveSilence = 0
-				v.inner.notifyListeners(VADStateSilence, energy, zcr)
-			}
-		}
-	}
-
-	return v.inner.state
-}
-
-func (v *WebRTCVAD) ProcessFloatFrame(samples []float32) VADState {
-	return v.ProcessFrame(Float32ToInt16(samples))
-}
-
-func (v *WebRTCVAD) RegisterListener(listener VADStateListener) {
-	v.inner.RegisterListener(listener)
-}
-
-func (v *WebRTCVAD) State() VADState {
-	return v.inner.State()
-}
-
-func (v *WebRTCVAD) Reset() {
-	v.inner.Reset()
-}
-
-func (v *WebRTCVAD) UpdateConfig(cfg VADConfig) {
-	v.inner.UpdateConfig(cfg)
-	if cfg.Aggressiveness >= 0 && cfg.Aggressiveness <= 3 {
-		_ = v.detector.SetMode(cfg.Aggressiveness)
-		v.mode = cfg.Aggressiveness
-	}
-}
-
-func (v *WebRTCVAD) Config() VADConfig {
-	cfg := v.inner.Config()
-	cfg.Aggressiveness = v.mode
-	return cfg
-}
-
-func int16ToLittleEndianBytes(samples []int16) []byte {
-	out := make([]byte, len(samples)*2)
-	for i, s := range samples {
-		binary.LittleEndian.PutUint16(out[i*2:], uint16(s))
-	}
-	return out
-}
-
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
deleted file mode 100644
index 64d89ffc..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/01_vad_provider_and_voicewake/pkg/speech/voicewake.go
+++ /dev/null
@@ -1,631 +0,0 @@
-package speech
-
-import (
-	"context"
-	"fmt"
-	"log"
-	"sync"
-	"time"
-)
-
-type VoiceWakeState string
-
-const (
-	VoiceWakeStateIdle       VoiceWakeState = "idle"
-	VoiceWakeStateListening  VoiceWakeState = "listening"
-	VoiceWakeStateRecording  VoiceWakeState = "recording"
-	VoiceWakeStateProcessing VoiceWakeState = "processing"
-	VoiceWakeStateTriggered  VoiceWakeState = "triggered"
-)
-
-type VoiceWakeEventType string
-
-const (
-	VoiceWakeEventStateChanged VoiceWakeEventType = "state_changed"
-	VoiceWakeEventWakeDetected VoiceWakeEventType = "wake_detected"
-	VoiceWakeEventSpeechStart  VoiceWakeEventType = "speech_start"
-	VoiceWakeEventSpeechEnd    VoiceWakeEventType = "speech_end"
-	VoiceWakeEventError        VoiceWakeEventType = "error"
-)
-
-type VoiceWakeEvent struct {
-	Type      VoiceWakeEventType
-	State     VoiceWakeState
-	Timestamp time.Time
-	Data      map[string]any
-}
-
-type VoiceWakeListener func(event VoiceWakeEvent)
-
-type AudioSource interface {
-	Start(ctx context.Context) error
-	Stop() error
-	Read(samples []int16) (int, error)
-	SampleRate() int
-	Channels() int
-}
-
-type VoiceWakeConfig struct {
-	VADConfig        VADConfig
-	VADProvider      VADProviderType
-	WakeWordConfig   WakeWordConfig
-	EngineConfig     WakeWordEngineConfig
-	SampleRate       int
-	Channels         int
-	FrameSize        int
-	MaxRecordingTime time.Duration
-	CooldownTime     time.Duration
-	AudioSource      AudioSource
-	STTPipeline      *STTPipeline
-	AutoTranscribe   bool
-	WakeWordEngine   WakeWordEngineType
-}
-
-func DefaultVoiceWakeConfig() VoiceWakeConfig {
-	return VoiceWakeConfig{
-		VADConfig:        DefaultVADConfig(),
-		VADProvider:      VADProviderWebRTC,
-		WakeWordConfig:   DefaultWakeWordConfig(),
-		SampleRate:       16000,
-		Channels:         1,
-		FrameSize:        320,
-		MaxRecordingTime: 30 * time.Second,
-		CooldownTime:     2 * time.Second,
-		AutoTranscribe:   true,
-	}
-}
-
-type VoiceWake struct {
-	mu              sync.Mutex
-	cfg             VoiceWakeConfig
-	state           VoiceWakeState
-	vad             VADProcessor
-	wakeDetector    *WakeWordDetector
-	engineRouter    *WakeWordEngineRouter
-	engineAdapter   *WakeWordEngineAdapter
-	listeners       []VoiceWakeListener
-	audioBuffer     []int16
-	recordingBuffer []int16
-	isRecording     bool
-	recordingStart  time.Time
-	cooldownUntil   time.Time
-	ctx             context.Context
-	cancel          context.CancelFunc
-	wg              sync.WaitGroup
-	transcriber     *STTPipeline
-	lastTranscript  string
-	lastWakeMatch   string
-	lastConfidence  float64
-	lastEnergy      float64
-}
-
-func NewVoiceWake(cfg VoiceWakeConfig) *VoiceWake {
-	if cfg.SampleRate == 0 {
-		cfg.SampleRate = 16000
-	}
-	if cfg.Channels == 0 {
-		cfg.Channels = 1
-	}
-	if cfg.FrameSize == 0 {
-		cfg.FrameSize = 320
-	}
-	if cfg.MaxRecordingTime == 0 {
-		cfg.MaxRecordingTime = 30 * time.Second
-	}
-	if cfg.CooldownTime == 0 {
-		cfg.CooldownTime = 2 * time.Second
-	}
-
-	cfg.VADConfig.SampleRate = cfg.SampleRate
-	cfg.VADConfig.FrameSize = cfg.FrameSize
-
-	cfg.EngineConfig.SampleRate = cfg.SampleRate
-	cfg.EngineConfig.FrameSize = cfg.FrameSize
-
-	if cfg.VADProvider == "" {
-		cfg.VADProvider = VADProviderHeuristic
-	}
-
-	vadManager := NewVADManager()
-	vad, err := vadManager.New(cfg.VADConfig, cfg.VADProvider)
-	if err != nil {
-		log.Printf("voicewake: failed to create VAD provider %q, fallback to heuristic: %v", cfg.VADProvider, err)
-		vad = NewVAD(cfg.VADConfig)
-	}
-	wakeDetector := NewWakeWordDetector(cfg.WakeWordConfig)
-
-	router := NewWakeWordEngineRouter(cfg.EngineConfig)
-	adapter := NewWakeWordEngineAdapter(router, wakeDetector)
-
-	vw := &VoiceWake{
-		cfg:           cfg,
-		state:         VoiceWakeStateIdle,
-		vad:           vad,
-		wakeDetector:  wakeDetector,
-		engineRouter:  router,
-		engineAdapter: adapter,
-		transcriber:   cfg.STTPipeline,
-	}
-
-	vad.RegisterListener(vw.onVADStateChanged)
-
-	return vw
-}
-
-func (vw *VoiceWake) RegisterListener(listener VoiceWakeListener) {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	vw.listeners = append(vw.listeners, listener)
-}
-
-func (vw *VoiceWake) Start(ctx context.Context) error {
-	vw.mu.Lock()
-	if vw.state != VoiceWakeStateIdle {
-		vw.mu.Unlock()
-		return fmt.Errorf("voicewake: already in state %s", vw.state)
-	}
-	vw.state = VoiceWakeStateListening
-	vw.mu.Unlock()
-
-	vw.ctx, vw.cancel = context.WithCancel(ctx)
-
-	if vw.cfg.AudioSource != nil {
-		if err := vw.cfg.AudioSource.Start(vw.ctx); err != nil {
-			vw.mu.Lock()
-			vw.state = VoiceWakeStateIdle
-			vw.mu.Unlock()
-			return fmt.Errorf("voicewake: failed to start audio source: %w", err)
-		}
-	}
-
-	vw.wg.Add(1)
-	go vw.listenLoop()
-
-	vw.notifyListeners(VoiceWakeEvent{
-		Type:      VoiceWakeEventStateChanged,
-		State:     VoiceWakeStateListening,
-		Timestamp: time.Now(),
-		Data:      map[string]any{"message": "Voice wake listener started"},
-	})
-
-	return nil
-}
-
-func (vw *VoiceWake) Stop() error {
-	vw.mu.Lock()
-	if vw.state == VoiceWakeStateIdle {
-		vw.mu.Unlock()
-		return nil
-	}
-
-	if vw.cancel != nil {
-		vw.cancel()
-	}
-	vw.state = VoiceWakeStateIdle
-	vw.mu.Unlock()
-
-	if vw.cfg.AudioSource != nil {
-		_ = vw.cfg.AudioSource.Stop()
-	}
-
-	vw.wg.Wait()
-
-	vw.notifyListeners(VoiceWakeEvent{
-		Type:      VoiceWakeEventStateChanged,
-		State:     VoiceWakeStateIdle,
-		Timestamp: time.Now(),
-		Data:      map[string]any{"message": "Voice wake listener stopped"},
-	})
-
-	return nil
-}
-
-func (vw *VoiceWake) listenLoop() {
-	defer vw.wg.Done()
-
-	samples := make([]int16, vw.cfg.FrameSize)
-
-	for {
-		select {
-		case <-vw.ctx.Done():
-			return
-		default:
-		}
-
-		var n int
-		var err error
-
-		if vw.cfg.AudioSource != nil {
-			n, err = vw.cfg.AudioSource.Read(samples)
-			if err != nil {
-				log.Printf("voicewake: error reading audio: %v", err)
-				time.Sleep(10 * time.Millisecond)
-				continue
-			}
-		} else {
-			time.Sleep(time.Duration(vw.cfg.FrameSize) * time.Second / time.Duration(vw.cfg.SampleRate))
-			continue
-		}
-
-		if n == 0 {
-			continue
-		}
-
-		vw.mu.Lock()
-		inCooldown := time.Now().Before(vw.cooldownUntil)
-		vw.mu.Unlock()
-
-		if inCooldown {
-			continue
-		}
-
-		if vw.engineAdapter != nil && vw.engineAdapter.UseEngine() {
-			result, detected := vw.engineAdapter.ProcessFrame(samples[:n])
-			if detected && result != nil {
-				vw.mu.Lock()
-				vw.lastWakeMatch = result.Keyword
-				vw.lastConfidence = result.Confidence
-				vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
-				vw.mu.Unlock()
-
-				vw.notifyListeners(VoiceWakeEvent{
-					Type:      VoiceWakeEventWakeDetected,
-					State:     VoiceWakeStateTriggered,
-					Timestamp: time.Now(),
-					Data: map[string]any{
-						"phrase":     result.Keyword,
-						"confidence": result.Confidence,
-						"engine":     string(result.Engine),
-						"energy":     0.0,
-					},
-				})
-
-				vw.mu.Lock()
-				vw.setState(VoiceWakeStateTriggered)
-				vw.mu.Unlock()
-
-				time.Sleep(vw.cfg.CooldownTime)
-
-				vw.mu.Lock()
-				vw.setState(VoiceWakeStateListening)
-				vw.mu.Unlock()
-
-				continue
-			}
-		}
-
-		vw.processAudio(samples[:n])
-	}
-}
-
-func (vw *VoiceWake) processAudio(samples []int16) {
-	vw.mu.Lock()
-	vw.audioBuffer = append(vw.audioBuffer, samples...)
-	vw.mu.Unlock()
-
-	state := vw.vad.ProcessFrame(samples)
-
-	switch state {
-	case VADStateSpeech:
-		vw.mu.Lock()
-		if !vw.isRecording {
-			vw.isRecording = true
-			vw.recordingStart = time.Now()
-			vw.recordingBuffer = make([]int16, 0, vw.cfg.SampleRate*int(vw.cfg.MaxRecordingTime.Seconds()))
-			vw.setState(VoiceWakeStateRecording)
-		}
-		vw.recordingBuffer = append(vw.recordingBuffer, samples...)
-		vw.mu.Unlock()
-
-	case VADStateSilence:
-		vw.mu.Lock()
-		if vw.isRecording {
-			vw.isRecording = false
-			recording := make([]int16, len(vw.recordingBuffer))
-			copy(recording, vw.recordingBuffer)
-			vw.recordingBuffer = nil
-			vw.mu.Unlock()
-
-			vw.processRecording(recording)
-		} else {
-			vw.mu.Unlock()
-		}
-	}
-}
-
-func (vw *VoiceWake) processRecording(samples []int16) {
-	if len(samples) == 0 {
-		return
-	}
-
-	vw.mu.Lock()
-	vw.setState(VoiceWakeStateProcessing)
-	vw.mu.Unlock()
-
-	if vw.cfg.AutoTranscribe && vw.transcriber != nil {
-		audioData := Int16ToWAV(samples, vw.cfg.SampleRate, vw.cfg.Channels)
-
-		go func() {
-			ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
-			defer cancel()
-
-			result, err := vw.transcriber.TranscribeDirect(ctx, audioData, WithSTTInputFormat(InputWAV))
-			if err != nil {
-				log.Printf("voicewake: transcription error: %v", err)
-				vw.notifyListeners(VoiceWakeEvent{
-					Type:      VoiceWakeEventError,
-					State:     VoiceWakeStateProcessing,
-					Timestamp: time.Now(),
-					Data:      map[string]any{"error": err.Error()},
-				})
-				vw.mu.Lock()
-				vw.setState(VoiceWakeStateListening)
-				vw.mu.Unlock()
-				return
-			}
-
-			vw.mu.Lock()
-			vw.lastTranscript = result.Text
-			vw.mu.Unlock()
-
-			vw.checkWakeWord(result.Text)
-		}()
-	} else {
-		vw.mu.Lock()
-		vw.setState(VoiceWakeStateListening)
-		vw.mu.Unlock()
-	}
-}
-
-func (vw *VoiceWake) checkWakeWord(transcript string) {
-	if transcript == "" {
-		vw.mu.Lock()
-		vw.setState(VoiceWakeStateListening)
-		vw.mu.Unlock()
-		return
-	}
-
-	phrase, confidence, matched := vw.wakeDetector.Detect(transcript)
-
-	vw.mu.Lock()
-	vw.lastTranscript = transcript
-	vw.lastWakeMatch = phrase
-	vw.lastConfidence = confidence
-	vw.mu.Unlock()
-
-	if matched {
-		vw.mu.Lock()
-		vw.setState(VoiceWakeStateTriggered)
-		vw.cooldownUntil = time.Now().Add(vw.cfg.CooldownTime)
-		energy := vw.lastEnergy
-		vw.mu.Unlock()
-
-		vw.notifyListeners(VoiceWakeEvent{
-			Type:      VoiceWakeEventWakeDetected,
-			State:     VoiceWakeStateTriggered,
-			Timestamp: time.Now(),
-			Data: map[string]any{
-				"phrase":     phrase,
-				"confidence": confidence,
-				"transcript": transcript,
-				"energy":     energy,
-			},
-		})
-
-		time.Sleep(vw.cfg.CooldownTime)
-
-		vw.mu.Lock()
-		vw.setState(VoiceWakeStateListening)
-		vw.mu.Unlock()
-	} else {
-		vw.mu.Lock()
-		vw.setState(VoiceWakeStateListening)
-		vw.mu.Unlock()
-	}
-}
-
-func (vw *VoiceWake) onVADStateChanged(state VADState, energy float64, zcr float64) {
-	vw.mu.Lock()
-	vw.lastEnergy = energy
-	vw.mu.Unlock()
-
-	switch state {
-	case VADStateSpeech:
-		vw.notifyListeners(VoiceWakeEvent{
-			Type:      VoiceWakeEventSpeechStart,
-			State:     vw.State(),
-			Timestamp: time.Now(),
-			Data: map[string]any{
-				"energy": energy,
-				"zcr":    zcr,
-			},
-		})
-
-	case VADStateSilence:
-		vw.notifyListeners(VoiceWakeEvent{
-			Type:      VoiceWakeEventSpeechEnd,
-			State:     vw.State(),
-			Timestamp: time.Now(),
-			Data: map[string]any{
-				"energy": energy,
-				"zcr":    zcr,
-			},
-		})
-	}
-}
-
-func (vw *VoiceWake) setState(state VoiceWakeState) {
-	oldState := vw.state
-	vw.state = state
-
-	if oldState != state {
-		vw.notifyListeners(VoiceWakeEvent{
-			Type:      VoiceWakeEventStateChanged,
-			State:     state,
-			Timestamp: time.Now(),
-			Data: map[string]any{
-				"previous_state": oldState,
-				"new_state":      state,
-			},
-		})
-	}
-}
-
-func (vw *VoiceWake) State() VoiceWakeState {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	return vw.state
-}
-
-func (vw *VoiceWake) notifyListeners(event VoiceWakeEvent) {
-	vw.mu.Lock()
-	listeners := make([]VoiceWakeListener, len(vw.listeners))
-	copy(listeners, vw.listeners)
-	vw.mu.Unlock()
-
-	for _, listener := range listeners {
-		listener(event)
-	}
-}
-
-func (vw *VoiceWake) LastTranscript() string {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	return vw.lastTranscript
-}
-
-func (vw *VoiceWake) LastWakeMatch() (string, float64) {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	return vw.lastWakeMatch, vw.lastConfidence
-}
-
-func (vw *VoiceWake) VAD() VADProcessor {
-	return vw.vad
-}
-
-func (vw *VoiceWake) WakeDetector() *WakeWordDetector {
-	return vw.wakeDetector
-}
-
-func (vw *VoiceWake) UpdateConfig(cfg VoiceWakeConfig) {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-
-	if cfg.SampleRate > 0 {
-		vw.cfg.SampleRate = cfg.SampleRate
-	}
-	if cfg.Channels > 0 {
-		vw.cfg.Channels = cfg.Channels
-	}
-	if cfg.FrameSize > 0 {
-		vw.cfg.FrameSize = cfg.FrameSize
-	}
-	if cfg.MaxRecordingTime > 0 {
-		vw.cfg.MaxRecordingTime = cfg.MaxRecordingTime
-	}
-	if cfg.CooldownTime > 0 {
-		vw.cfg.CooldownTime = cfg.CooldownTime
-	}
-
-	vw.cfg.AutoTranscribe = cfg.AutoTranscribe
-}
-
-func (vw *VoiceWake) Config() VoiceWakeConfig {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	return vw.cfg
-}
-
-func (vw *VoiceWake) SetTranscriber(pipeline *STTPipeline) {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	vw.transcriber = pipeline
-}
-
-func (vw *VoiceWake) RegisterEngine(engineType WakeWordEngineType, cfg WakeWordEngineConfig) error {
-	vw.mu.Lock()
-	router := vw.engineRouter
-	vw.mu.Unlock()
-
-	if router == nil {
-		return fmt.Errorf("voicewake: no engine router available")
-	}
-
-	if err := router.CreateEngine(engineType, cfg); err != nil {
-		return err
-	}
-
-	vw.engineAdapter.SetUseEngine(true)
-	return nil
-}
-
-func (vw *VoiceWake) SetActiveEngine(name string) error {
-	vw.mu.Lock()
-	router := vw.engineRouter
-	vw.mu.Unlock()
-
-	if router == nil {
-		return fmt.Errorf("voicewake: no engine router available")
-	}
-
-	return router.SetActive(name)
-}
-
-func (vw *VoiceWake) UseWakeWordEngine(use bool) {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	if vw.engineAdapter != nil {
-		vw.engineAdapter.SetUseEngine(use)
-	}
-}
-
-func (vw *VoiceWake) IsUsingWakeWordEngine() bool {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	if vw.engineAdapter == nil {
-		return false
-	}
-	return vw.engineAdapter.UseEngine()
-}
-
-func (vw *VoiceWake) AvailableEngines() []string {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	if vw.engineRouter == nil {
-		return nil
-	}
-	return vw.engineRouter.Engines()
-}
-
-func (vw *VoiceWake) ActiveEngine() string {
-	vw.mu.Lock()
-	defer vw.mu.Unlock()
-	if vw.engineRouter == nil {
-		return ""
-	}
-	return vw.engineRouter.ActiveEngine()
-}
-
-func (vw *VoiceWake) EngineRouter() *WakeWordEngineRouter {
-	return vw.engineRouter
-}
-
-func (vw *VoiceWake) EngineAdapter() *WakeWordEngineAdapter {
-	return vw.engineAdapter
-}
-
-func (vw *VoiceWake) Close() error {
-	if err := vw.Stop(); err != nil {
-		return err
-	}
-
-	vw.mu.Lock()
-	router := vw.engineRouter
-	vw.mu.Unlock()
-
-	if router != nil {
-		return router.Close()
-	}
-	return nil
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
deleted file mode 100644
index 3d986daa..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_openai_client.go
+++ /dev/null
@@ -1,126 +0,0 @@
-package speech
-
-import (
-	"bytes"
-	"context"
-	"fmt"
-	"mime/multipart"
-	"net/http"
-)
-
-type openAIAudioAPIClient struct {
-	apiKey  string
-	baseURL string
-	client  *http.Client
-}
-
-func newOpenAIAudioAPIClient(apiKey, baseURL string, client *http.Client) *openAIAudioAPIClient {
-	return &openAIAudioAPIClient{
-		apiKey:  apiKey,
-		baseURL: baseURL,
-		client:  client,
-	}
-}
-
-func (c *openAIAudioAPIClient) DoTranscriptionRequest(ctx context.Context, endpoint string, audio []byte, options TranscribeOptions, stream bool) (*http.Response, error) {
-	body, contentType, err := c.buildMultipartBody(audio, options, stream)
-	if err != nil {
-		return nil, err
-	}
-
-	req, err := http.NewRequestWithContext(ctx, "POST", c.baseURL+endpoint, body)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create request: %v", err)
-	}
-
-	req.Header.Set("Authorization", "Bearer "+c.apiKey)
-	req.Header.Set("Content-Type", contentType)
-	req.Header.Set("User-Agent", "anyclaw-stt/1.0")
-	if stream {
-		req.Header.Set("Accept", "text/event-stream")
-	}
-
-	resp, err := c.client.Do(req)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: request failed: %v", err)
-	}
-
-	return resp, nil
-}
-
-func (c *openAIAudioAPIClient) buildMultipartBody(audio []byte, options TranscribeOptions, stream bool) (*bytes.Buffer, string, error) {
-	var body bytes.Buffer
-	writer := multipart.NewWriter(&body)
-
-	filename := "audio." + string(options.InputFormat)
-	part, err := writer.CreateFormFile("file", filename)
-	if err != nil {
-		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to create form file: %v", err)
-	}
-
-	if _, err := part.Write(audio); err != nil {
-		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write audio data: %v", err)
-	}
-
-	if err := writer.WriteField("model", options.Model); err != nil {
-		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write model field: %v", err)
-	}
-
-	if options.Language != "" {
-		if err := writer.WriteField("language", options.Language); err != nil {
-			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write language field: %v", err)
-		}
-	}
-
-	if options.Prompt != "" {
-		if err := writer.WriteField("prompt", options.Prompt); err != nil {
-			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write prompt field: %v", err)
-		}
-	}
-
-	if options.Temperature > 0 {
-		if err := writer.WriteField("temperature", fmt.Sprintf("%.2f", options.Temperature)); err != nil {
-			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write temperature field: %v", err)
-		}
-	}
-
-	if options.MaxAlternatives > 0 {
-		if err := writer.WriteField("max_alternatives", fmt.Sprintf("%d", options.MaxAlternatives)); err != nil {
-			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write max_alternatives field: %v", err)
-		}
-	}
-
-	if options.WordTimestamps || options.SpeakerLabels {
-		if options.WordTimestamps {
-			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
-				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
-			}
-		}
-		if options.SpeakerLabels {
-			if err := writer.WriteField("timestamp_granularities[]", "segment"); err != nil {
-				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write segment timestamp_granularities: %v", err)
-			}
-		}
-	}
-
-	responseType := "verbose_json"
-	if stream {
-		responseType = "json"
-	}
-	if err := writer.WriteField("response_format", responseType); err != nil {
-		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write response_format field: %v", err)
-	}
-
-	if stream {
-		if err := writer.WriteField("stream", "true"); err != nil {
-			return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write stream field: %v", err)
-		}
-	}
-
-	if err := writer.Close(); err != nil {
-		return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to close multipart writer: %v", err)
-	}
-
-	return &body, writer.FormDataContentType(), nil
-}
-
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
deleted file mode 100644
index e94b451d..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/02_stt_openai_client_refactor/pkg/speech/stt_whisper.go
+++ /dev/null
@@ -1,521 +0,0 @@
-package speech
-
-import (
-	"bufio"
-	"context"
-	"encoding/json"
-	"fmt"
-	"io"
-	"net/http"
-	"os"
-	"path/filepath"
-	"strings"
-	"time"
-)
-
-type WhisperModel string
-
-const (
-	WhisperModelV1 WhisperModel = "whisper-1"
-)
-
-var validWhisperModels = map[WhisperModel]bool{
-	WhisperModelV1: true,
-}
-
-var validInputFormats = map[AudioInputFormat]bool{
-	InputMP3:  true,
-	InputWAV:  true,
-	InputOGG:  true,
-	InputFLAC: true,
-	InputM4A:  true,
-	InputMP4:  true,
-	InputMPEG: true,
-	InputMPGA: true,
-	InputWEBM: true,
-}
-
-type WhisperProvider struct {
-	apiKey        string
-	baseURL       string
-	model         WhisperModel
-	language      string
-	timeout       time.Duration
-	retries       int
-	client        *http.Client
-	apiClient     *openAIAudioAPIClient
-	httpTransport *http.Transport
-}
-
-type WhisperOption func(*WhisperProvider)
-
-func WithWhisperBaseURL(url string) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.baseURL = strings.TrimRight(url, "/")
-	}
-}
-
-func WithWhisperModel(model WhisperModel) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.model = model
-	}
-}
-
-func WithWhisperLanguage(lang string) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.language = lang
-	}
-}
-
-func WithWhisperTimeout(timeout time.Duration) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.timeout = timeout
-	}
-}
-
-func WithWhisperRetries(retries int) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.retries = retries
-	}
-}
-
-func WithWhisperHTTPTransport(transport *http.Transport) WhisperOption {
-	return func(p *WhisperProvider) {
-		p.httpTransport = transport
-	}
-}
-
-func NewWhisperProvider(apiKey string, opts ...WhisperOption) (*WhisperProvider, error) {
-	if apiKey == "" {
-		return nil, NewSTTError(ErrAuthentication, "openai: API key is required")
-	}
-
-	p := &WhisperProvider{
-		apiKey:  apiKey,
-		baseURL: "https://api.openai.com",
-		model:   WhisperModelV1,
-		timeout: 120 * time.Second,
-		retries: 2,
-		client:  &http.Client{Timeout: 120 * time.Second},
-	}
-
-	for _, opt := range opts {
-		opt(p)
-	}
-
-	if p.httpTransport != nil {
-		p.client.Transport = p.httpTransport
-	}
-	p.client.Timeout = p.timeout
-	p.apiClient = newOpenAIAudioAPIClient(p.apiKey, p.baseURL, p.client)
-
-	if !validWhisperModels[p.model] {
-		return nil, NewSTTErrorf(ErrProviderNotSupported, "openai: invalid whisper model: %s", p.model)
-	}
-
-	return p, nil
-}
-
-func (p *WhisperProvider) Name() string {
-	return "openai-whisper"
-}
-
-func (p *WhisperProvider) Type() STTProviderType {
-	return STTProviderOpenAI
-}
-
-func (p *WhisperProvider) Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error) {
-	options := TranscribeOptions{
-		Model:       string(p.model),
-		Language:    p.language,
-		Temperature: 0,
-		Mode:        ModeTranscription,
-		InputFormat: InputMP3,
-	}
-	for _, opt := range opts {
-		opt(&options)
-	}
-
-	if err := p.validateTranscribeOptions(options); err != nil {
-		return nil, err
-	}
-
-	if len(audio) == 0 {
-		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
-	}
-
-	const maxAudioSize = 25 * 1024 * 1024
-	if len(audio) > maxAudioSize {
-		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: audio exceeds 25MB limit (%d bytes)", len(audio))
-	}
-
-	if !validInputFormats[options.InputFormat] {
-		return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: unsupported input format: %s", options.InputFormat)
-	}
-
-	var lastErr error
-	for attempt := 0; attempt <= p.retries; attempt++ {
-		if attempt > 0 {
-			backoff := time.Duration(attempt) * time.Second
-			select {
-			case <-ctx.Done():
-				return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: context cancelled during retry: %v", ctx.Err())
-			case <-time.After(backoff):
-			}
-		}
-
-		result, err := p.doTranscribe(ctx, audio, options)
-		if err == nil {
-			return result, nil
-		}
-
-		lastErr = err
-
-		if sttErr, ok := err.(*STTError); ok {
-			if sttErr.Code == ErrAuthentication || sttErr.Code == ErrAudioFormatInvalid || sttErr.Code == ErrAudioTooLarge || sttErr.Code == ErrRateLimited {
-				return nil, err
-			}
-		}
-	}
-
-	return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: all %d retries failed: %v", p.retries, lastErr)
-}
-
-func (p *WhisperProvider) TranscribeFile(ctx context.Context, filePath string, opts ...TranscribeOption) (*TranscriptResult, error) {
-	if filePath == "" {
-		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: file path is empty")
-	}
-
-	info, err := os.Stat(filePath)
-	if err != nil {
-		if os.IsNotExist(err) {
-			return nil, NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: file not found: %s", filePath)
-		}
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to stat file: %v", err)
-	}
-
-	const maxAudioSize = 25 * 1024 * 1024
-	if info.Size() > maxAudioSize {
-		return nil, NewSTTErrorf(ErrAudioTooLarge, "openai-whisper: file exceeds 25MB limit (%d bytes)", info.Size())
-	}
-
-	audio, err := os.ReadFile(filePath)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read file: %v", err)
-	}
-
-	if len(opts) == 0 || anyInputFormatNotSet(opts) {
-		ext := strings.TrimPrefix(strings.ToLower(filepath.Ext(filePath)), ".")
-		if ext != "" {
-			formatOpts := append([]TranscribeOption{WithSTTInputFormat(AudioInputFormat(ext))}, opts...)
-			return p.Transcribe(ctx, audio, formatOpts...)
-		}
-	}
-
-	return p.Transcribe(ctx, audio, opts...)
-}
-
-func anyInputFormatNotSet(opts []TranscribeOption) bool {
-	for _, opt := range opts {
-		o := &TranscribeOptions{}
-		opt(o)
-		if o.InputFormat != "" {
-			return false
-		}
-	}
-	return true
-}
-
-func (p *WhisperProvider) TranscribeStream(ctx context.Context, reader io.Reader, opts ...TranscribeOption) (*TranscriptResult, error) {
-	if reader == nil {
-		return nil, NewSTTError(ErrAudioFormatInvalid, "openai-whisper: reader is nil")
-	}
-
-	audio, err := io.ReadAll(reader)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read stream: %v", err)
-	}
-
-	return p.Transcribe(ctx, audio, opts...)
-}
-
-func (p *WhisperProvider) validateTranscribeOptions(options TranscribeOptions) error {
-	if options.Temperature < 0 || options.Temperature > 1 {
-		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: temperature must be between 0 and 1, got: %f", options.Temperature)
-	}
-
-	if options.MaxAlternatives < 0 {
-		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: maxAlternatives cannot be negative")
-	}
-
-	if options.Model == "" {
-		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: model is required")
-	}
-
-	return nil
-}
-
-func (p *WhisperProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
-	var endpoint string
-	switch options.Mode {
-	case ModeTranslation:
-		endpoint = "/v1/audio/translations"
-	default:
-		endpoint = "/v1/audio/transcriptions"
-	}
-
-	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, false)
-	if err != nil {
-		return nil, err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		respBody, _ := io.ReadAll(resp.Body)
-		return nil, p.handleErrorResponse(resp.StatusCode, respBody)
-	}
-
-	respBody, err := io.ReadAll(resp.Body)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to read response: %v", err)
-	}
-
-	return p.parseResponse(respBody, options)
-}
-
-func (p *WhisperProvider) handleErrorResponse(statusCode int, body []byte) error {
-	var errResp whisperErrorResponse
-	if err := json.Unmarshal(body, &errResp); err == nil && errResp.Error.Message != "" {
-		msg := fmt.Sprintf("openai-whisper: API error: %s (type: %s, code: %s)",
-			errResp.Error.Message, errResp.Error.Type, errResp.Error.Code)
-		switch statusCode {
-		case http.StatusUnauthorized:
-			return NewSTTError(ErrAuthentication, msg)
-		case http.StatusTooManyRequests:
-			return NewSTTError(ErrRateLimited, msg)
-		case http.StatusBadRequest:
-			return NewSTTError(ErrAudioFormatInvalid, msg)
-		default:
-			return NewSTTError(ErrTranscriptionFailed, msg)
-		}
-	}
-
-	switch statusCode {
-	case http.StatusUnauthorized:
-		return NewSTTError(ErrAuthentication, fmt.Sprintf("openai-whisper: authentication failed: %s", string(body)))
-	case http.StatusTooManyRequests:
-		return NewSTTError(ErrRateLimited, fmt.Sprintf("openai-whisper: rate limited: %s", string(body)))
-	case http.StatusBadRequest:
-		return NewSTTErrorf(ErrAudioFormatInvalid, "openai-whisper: invalid request: %s", string(body))
-	case http.StatusServiceUnavailable:
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: service unavailable: %s", string(body))
-	default:
-		return NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: unexpected status %d: %s", statusCode, string(body))
-	}
-}
-
-type whisperResponse struct {
-	Text     string  `json:"text"`
-	Language string  `json:"language"`
-	Duration float64 `json:"duration,omitempty"`
-	Segments []struct {
-		ID           int     `json:"id"`
-		Seek         int     `json:"seek"`
-		Start        float64 `json:"start"`
-		End          float64 `json:"end"`
-		Text         string  `json:"text"`
-		Tokens       []int   `json:"tokens"`
-		Temperature  float64 `json:"temperature"`
-		AvgLogProb   float64 `json:"avg_logprob"`
-		Compression  float64 `json:"compression_ratio"`
-		NoSpeechProb float64 `json:"no_speech_prob"`
-		Words        []struct {
-			Word       string  `json:"word"`
-			Start      float64 `json:"start"`
-			End        float64 `json:"end"`
-			Confidence float64 `json:"probability"`
-		} `json:"words,omitempty"`
-	} `json:"segments,omitempty"`
-	LanguageProbability float64 `json:"language_probability,omitempty"`
-}
-
-type whisperErrorResponse struct {
-	Error struct {
-		Message string `json:"message"`
-		Type    string `json:"type"`
-		Code    string `json:"code"`
-	} `json:"error"`
-}
-
-func (p *WhisperProvider) parseResponse(body []byte, options TranscribeOptions) (*TranscriptResult, error) {
-	var resp whisperResponse
-	if err := json.Unmarshal(body, &resp); err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to parse JSON response: %v", err)
-	}
-
-	result := &TranscriptResult{
-		Text:       strings.TrimSpace(resp.Text),
-		Language:   resp.Language,
-		Duration:   time.Duration(resp.Duration * float64(time.Second)),
-		Confidence: resp.LanguageProbability,
-	}
-
-	if len(resp.Segments) > 0 {
-		result.Segments = make([]SegmentInfo, 0, len(resp.Segments))
-		for _, seg := range resp.Segments {
-			segment := SegmentInfo{
-				ID:        seg.ID,
-				Text:      seg.Text,
-				StartTime: time.Duration(seg.Start * float64(time.Second)),
-				EndTime:   time.Duration(seg.End * float64(time.Second)),
-			}
-
-			if seg.AvgLogProb != 0 {
-				segment.Confidence = normalizeLogProb(seg.AvgLogProb)
-			}
-
-			if len(seg.Words) > 0 {
-				segment.Words = make([]WordInfo, 0, len(seg.Words))
-				for _, w := range seg.Words {
-					segment.Words = append(segment.Words, WordInfo{
-						Word:       w.Word,
-						StartTime:  time.Duration(w.Start * float64(time.Second)),
-						EndTime:    time.Duration(w.End * float64(time.Second)),
-						Confidence: w.Confidence,
-					})
-				}
-			}
-
-			result.Segments = append(result.Segments, segment)
-		}
-
-		if len(result.Segments) > 0 && result.Confidence == 0 {
-			totalConfidence := 0.0
-			for _, seg := range result.Segments {
-				totalConfidence += seg.Confidence
-			}
-			result.Confidence = totalConfidence / float64(len(result.Segments))
-		}
-	}
-
-	if options.WordTimestamps && len(result.Segments) > 0 {
-		words := make([]WordInfo, 0)
-		for _, seg := range result.Segments {
-			words = append(words, seg.Words...)
-		}
-		result.Words = words
-	}
-
-	return result, nil
-}
-
-func normalizeLogProb(logProb float64) float64 {
-	if logProb > 0 {
-		return 1.0
-	}
-	prob := 1.0 / (1.0 + logProb*-1)
-	if prob < 0 {
-		return 0
-	}
-	if prob > 1 {
-		return 1
-	}
-	return prob
-}
-
-func (p *WhisperProvider) TranscribeSSE(ctx context.Context, audio []byte, onChunk func(chunk *TranscriptResult), opts ...TranscribeOption) error {
-	options := TranscribeOptions{
-		Model:       string(p.model),
-		Language:    p.language,
-		Temperature: 0,
-		Mode:        ModeTranscription,
-		InputFormat: InputMP3,
-	}
-	for _, opt := range opts {
-		opt(&options)
-	}
-
-	if err := p.validateTranscribeOptions(options); err != nil {
-		return err
-	}
-
-	if len(audio) == 0 {
-		return NewSTTError(ErrAudioFormatInvalid, "openai-whisper: audio data is empty")
-	}
-
-	endpoint := "/v1/audio/transcriptions"
-	if options.Mode == ModeTranslation {
-		endpoint = "/v1/audio/translations"
-	}
-
-	resp, err := p.apiClient.DoTranscriptionRequest(ctx, endpoint, audio, options, true)
-	if err != nil {
-		return err
-	}
-	defer resp.Body.Close()
-
-	if resp.StatusCode != http.StatusOK {
-		respBody, _ := io.ReadAll(resp.Body)
-		return p.handleErrorResponse(resp.StatusCode, respBody)
-	}
-
-	return p.readSSEStream(resp.Body, onChunk)
-}
-
-func (p *WhisperProvider) readSSEStream(reader io.Reader, onChunk func(chunk *TranscriptResult)) error {
-	scanner := bufio.NewScanner(reader)
-	scanner.Split(bufio.ScanLines)
-
-	var currentText strings.Builder
-	var detectedLanguage string
-
-	for scanner.Scan() {
-		line := scanner.Text()
-
-		if strings.HasPrefix(line, "data: ") {
-			data := strings.TrimPrefix(line, "data: ")
-			if data == "[DONE]" {
-				break
-			}
-
-			var chunk struct {
-				Text     string `json:"text"`
-				Language string `json:"language"`
-				Done     bool   `json:"done"`
-			}
-			if err := json.Unmarshal([]byte(data), &chunk); err != nil {
-				continue
-			}
-
-			if chunk.Text != "" {
-				currentText.WriteString(chunk.Text)
-			}
-			if chunk.Language != "" {
-				detectedLanguage = chunk.Language
-			}
-
-			onChunk(&TranscriptResult{
-				Text:     currentText.String(),
-				Language: detectedLanguage,
-			})
-
-			if chunk.Done {
-				break
-			}
-		}
-	}
-
-	return scanner.Err()
-}
-
-func (p *WhisperProvider) ListLanguages(ctx context.Context) ([]string, error) {
-	return []string{
-		"af", "am", "ar", "as", "az", "ba", "be", "bg", "bn", "bo", "br", "bs", "ca", "cs", "cy", "da",
-		"de", "el", "en", "es", "et", "eu", "fa", "fi", "fo", "fr", "gl", "gu", "ha", "haw", "he", "hi",
-		"hr", "ht", "hu", "hy", "id", "is", "it", "ja", "jw", "ka", "kk", "km", "kn", "ko", "la", "lb",
-		"ln", "lo", "lt", "lv", "mg", "mi", "mk", "ml", "mn", "mr", "ms", "mt", "my", "ne", "nl", "nn",
-		"no", "oc", "pa", "pl", "ps", "pt", "ro", "ru", "sa", "sd", "si", "sk", "sl", "sn", "so", "sq",
-		"sr", "su", "sv", "sw", "ta", "te", "tg", "th", "tk", "tl", "tr", "tt", "uk", "ur", "uz", "vi",
-		"yi", "yo", "zh",
-	}, nil
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
deleted file mode 100644
index 0917ad2d..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.mod
+++ /dev/null
@@ -1,102 +0,0 @@
-module github.com/1024XEngineer/anyclaw
-
-go 1.25.1
-
-require (
-	github.com/anyclaw/anyclaw v0.0.0
-	github.com/charmbracelet/bubbles v1.0.0
-	github.com/charmbracelet/bubbletea v1.3.10
-	github.com/charmbracelet/lipgloss v1.1.0
-	github.com/charmbracelet/x/term v0.2.2
-	github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335
-	github.com/chromedp/chromedp v0.10.0
-	github.com/gorilla/websocket v1.5.3
-	github.com/wailsapp/wails/v2 v2.11.0
-	golang.org/x/sys v0.42.0
-	golang.org/x/text v0.35.0
-	modernc.org/sqlite v1.48.1
-)
-
-replace github.com/anyclaw/anyclaw => .
-
-require (
-	cloud.google.com/go v0.123.0 // indirect
-	cloud.google.com/go/auth v0.20.0 // indirect
-	cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect
-	cloud.google.com/go/compute/metadata v0.9.0 // indirect
-	cloud.google.com/go/longrunning v0.9.0 // indirect
-	cloud.google.com/go/speech v1.33.0 // indirect
-	github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
-	github.com/bep/debounce v1.2.1 // indirect
-	github.com/cespare/xxhash/v2 v2.3.0 // indirect
-	github.com/charmbracelet/colorprofile v0.4.1 // indirect
-	github.com/charmbracelet/x/ansi v0.11.6 // indirect
-	github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
-	github.com/chromedp/sysutil v1.0.0 // indirect
-	github.com/clipperhouse/displaywidth v0.9.0 // indirect
-	github.com/clipperhouse/stringish v0.1.1 // indirect
-	github.com/clipperhouse/uax29/v2 v2.5.0 // indirect
-	github.com/dustin/go-humanize v1.0.1 // indirect
-	github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
-	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/go-logr/logr v1.4.3 // indirect
-	github.com/go-logr/stdr v1.2.2 // indirect
-	github.com/go-ole/go-ole v1.3.0 // indirect
-	github.com/gobwas/httphead v0.1.0 // indirect
-	github.com/gobwas/pool v0.2.1 // indirect
-	github.com/gobwas/ws v1.4.0 // indirect
-	github.com/godbus/dbus/v5 v5.1.0 // indirect
-	github.com/godeps/webrtcvad-go v0.1.0 // indirect
-	github.com/google/s2a-go v0.1.9 // indirect
-	github.com/google/uuid v1.6.0 // indirect
-	github.com/googleapis/enterprise-certificate-proxy v0.3.14 // indirect
-	github.com/googleapis/gax-go/v2 v2.21.0 // indirect
-	github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e // indirect
-	github.com/josharian/intern v1.0.0 // indirect
-	github.com/labstack/echo/v4 v4.13.3 // indirect
-	github.com/labstack/gommon v0.4.2 // indirect
-	github.com/leaanthony/go-ansi-parser v1.6.1 // indirect
-	github.com/leaanthony/gosod v1.0.4 // indirect
-	github.com/leaanthony/slicer v1.6.0 // indirect
-	github.com/leaanthony/u v1.1.1 // indirect
-	github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
-	github.com/mailru/easyjson v0.7.7 // indirect
-	github.com/mattn/go-colorable v0.1.13 // indirect
-	github.com/mattn/go-isatty v0.0.20 // indirect
-	github.com/mattn/go-localereader v0.0.1 // indirect
-	github.com/mattn/go-runewidth v0.0.19 // indirect
-	github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
-	github.com/muesli/cancelreader v0.2.2 // indirect
-	github.com/muesli/termenv v0.16.0 // indirect
-	github.com/ncruces/go-strftime v1.0.0 // indirect
-	github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect
-	github.com/pkg/errors v0.9.1 // indirect
-	github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
-	github.com/rivo/uniseg v0.4.7 // indirect
-	github.com/samber/lo v1.49.1 // indirect
-	github.com/tkrajina/go-reflector v0.5.8 // indirect
-	github.com/valyala/bytebufferpool v1.0.0 // indirect
-	github.com/valyala/fasttemplate v1.2.2 // indirect
-	github.com/wailsapp/go-webview2 v1.0.22 // indirect
-	github.com/wailsapp/mimetype v1.4.1 // indirect
-	github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
-	go.opentelemetry.io/auto/sdk v1.2.1 // indirect
-	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 // indirect
-	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect
-	go.opentelemetry.io/otel v1.43.0 // indirect
-	go.opentelemetry.io/otel/metric v1.43.0 // indirect
-	go.opentelemetry.io/otel/trace v1.43.0 // indirect
-	golang.org/x/crypto v0.49.0 // indirect
-	golang.org/x/net v0.52.0 // indirect
-	golang.org/x/oauth2 v0.36.0 // indirect
-	golang.org/x/sync v0.20.0 // indirect
-	golang.org/x/time v0.15.0 // indirect
-	google.golang.org/api v0.275.0 // indirect
-	google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect
-	google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect
-	google.golang.org/grpc v1.80.0 // indirect
-	google.golang.org/protobuf v1.36.11 // indirect
-	modernc.org/libc v1.70.0 // indirect
-	modernc.org/mathutil v1.7.1 // indirect
-	modernc.org/memory v1.11.0 // indirect
-)
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
deleted file mode 100644
index c0e69ec6..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/go.sum
+++ /dev/null
@@ -1,251 +0,0 @@
-cloud.google.com/go v0.123.0 h1:2NAUJwPR47q+E35uaJeYoNhuNEM9kM8SjgRgdeOJUSE=
-cloud.google.com/go v0.123.0/go.mod h1:xBoMV08QcqUGuPW65Qfm1o9Y4zKZBpGS+7bImXLTAZU=
-cloud.google.com/go/auth v0.20.0 h1:kXTssoVb4azsVDoUiF8KvxAqrsQcQtB53DcSgta74CA=
-cloud.google.com/go/auth v0.20.0/go.mod h1:942/yi/itH1SsmpyrbnTMDgGfdy2BUqIKyd0cyYLc5Q=
-cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc=
-cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c=
-cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
-cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
-cloud.google.com/go/longrunning v0.9.0 h1:0EzbDEGsAvOZNbqXopgniY0w0a1phvu5IdUFq8grmqY=
-cloud.google.com/go/longrunning v0.9.0/go.mod h1:pkTz846W7bF4o2SzdWJ40Hu0Re+UoNT6Q5t+igIcb8E=
-cloud.google.com/go/speech v1.33.0 h1:555yroj4HCS7SPgfHuDU8zX+E5KrhccVWG96HNyBUAk=
-cloud.google.com/go/speech v1.33.0/go.mod h1:shnf33sZbGnQQZyek1fdLOR5rRKV6D3jsNqpqyijvj8=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
-github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
-github.com/bep/debounce v1.2.1 h1:v67fRdBA9UQu2NhLFXrSg0Brw7CexQekrBwDMM8bzeY=
-github.com/bep/debounce v1.2.1/go.mod h1:H8yggRPQKLUhUoqrJC1bO2xNya7vanpDl7xR3ISbCJ0=
-github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
-github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
-github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
-github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
-github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
-github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
-github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
-github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
-github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
-github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
-github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8=
-github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ=
-github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
-github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
-github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
-github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
-github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335 h1:bATMoZLH2QGct1kzDxfmeBUQI/QhQvB0mBrOTct+YlQ=
-github.com/chromedp/cdproto v0.0.0-20240801214329-3f85d328b335/go.mod h1:GKljq0VrfU4D5yc+2qA6OVr8pmO/MBbPEWqWQ/oqGEs=
-github.com/chromedp/chromedp v0.10.0 h1:bRclRYVpMm/UVD76+1HcRW9eV3l58rFfy7AdBvKab1E=
-github.com/chromedp/chromedp v0.10.0/go.mod h1:ei/1ncZIqXX1YnAYDkxhD4gzBgavMEUu7JCKvztdomE=
-github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3Ic=
-github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
-github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA=
-github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA=
-github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs=
-github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA=
-github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
-github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
-github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
-github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
-github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
-github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
-github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
-github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
-github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
-github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
-github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
-github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
-github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
-github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
-github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE=
-github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78=
-github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
-github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
-github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
-github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
-github.com/gobwas/ws v1.4.0 h1:CTaoG1tojrh4ucGPcoJFiAQUAsEWekEWvLy7GsVNqGs=
-github.com/gobwas/ws v1.4.0/go.mod h1:G3gNqMNtPppf5XUz7O4shetPpcZ1VJ7zt18dlUeakrc=
-github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk=
-github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
-github.com/godeps/webrtcvad-go v0.1.0 h1:JpVfJHSzND9p/iuO7xqko1UlB/UJjKxskEWEbzKKjrQ=
-github.com/godeps/webrtcvad-go v0.1.0/go.mod h1:487THSHEZrYU29LRm4AKYCm/Y8PPq3pIJSuz1KX3MwU=
-github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e h1:ijClszYn+mADRFY17kjQEVQ1XRhq2/JR1M3sGqeJoxs=
-github.com/google/pprof v0.0.0-20250317173921-a4b03ec1a45e/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
-github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
-github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
-github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
-github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
-github.com/googleapis/enterprise-certificate-proxy v0.3.14 h1:yh8ncqsbUY4shRD5dA6RlzjJaT4hi3kII+zYw8wmLb8=
-github.com/googleapis/enterprise-certificate-proxy v0.3.14/go.mod h1:vqVt9yG9480NtzREnTlmGSBmFrA+bzb0yl0TxoBQXOg=
-github.com/googleapis/gax-go/v2 v2.21.0 h1:h45NjjzEO3faG9Lg/cFrBh2PgegVVgzqKzuZl/wMbiI=
-github.com/googleapis/gax-go/v2 v2.21.0/go.mod h1:But/NJU6TnZsrLai/xBAQLLz+Hc7fHZJt/hsCz3Fih4=
-github.com/gorilla/websocket v1.5.3 h1:saDtZ6Pbx/0u+bgYQ3q96pZgCzfhKXGPqt7kZ72aNNg=
-github.com/gorilla/websocket v1.5.3/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/adAjf1fMHhE=
-github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k=
-github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
-github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e h1:Q3+PugElBCf4PFpxhErSzU3/PY5sFL5Z6rfv4AbGAck=
-github.com/jchv/go-winloader v0.0.0-20210711035445-715c2860da7e/go.mod h1:alcuEEnZsY1WQsagKhZDsoPCRoOijYqhZvPwLG0kzVs=
-github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
-github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
-github.com/labstack/echo/v4 v4.13.3 h1:pwhpCPrTl5qry5HRdM5FwdXnhXSLSY+WE+YQSeCaafY=
-github.com/labstack/echo/v4 v4.13.3/go.mod h1:o90YNEeQWjDozo584l7AwhJMHN0bOC4tAfg+Xox9q5g=
-github.com/labstack/gommon v0.4.2 h1:F8qTUNXgG1+6WQmqoUWnz8WiEU60mXVVw0P4ht1WRA0=
-github.com/labstack/gommon v0.4.2/go.mod h1:QlUFxVM+SNXhDL/Z7YhocGIBYOiwB0mXm1+1bAPHPyU=
-github.com/leaanthony/debme v1.2.1 h1:9Tgwf+kjcrbMQ4WnPcEIUcQuIZYqdWftzZkBr+i/oOc=
-github.com/leaanthony/debme v1.2.1/go.mod h1:3V+sCm5tYAgQymvSOfYQ5Xx2JCr+OXiD9Jkw3otUjiA=
-github.com/leaanthony/go-ansi-parser v1.6.1 h1:xd8bzARK3dErqkPFtoF9F3/HgN8UQk0ed1YDKpEz01A=
-github.com/leaanthony/go-ansi-parser v1.6.1/go.mod h1:+vva/2y4alzVmmIEpk9QDhA7vLC5zKDTRwfZGOp3IWU=
-github.com/leaanthony/gosod v1.0.4 h1:YLAbVyd591MRffDgxUOU1NwLhT9T1/YiwjKZpkNFeaI=
-github.com/leaanthony/gosod v1.0.4/go.mod h1:GKuIL0zzPj3O1SdWQOdgURSuhkF+Urizzxh26t9f1cw=
-github.com/leaanthony/slicer v1.6.0 h1:1RFP5uiPJvT93TAHi+ipd3NACobkW53yUiBqZheE/Js=
-github.com/leaanthony/slicer v1.6.0/go.mod h1:o/Iz29g7LN0GqH3aMjWAe90381nyZlDNquK+mtH2Fj8=
-github.com/leaanthony/u v1.1.1 h1:TUFjwDGlNX+WuwVEzDqQwC2lOv0P4uhTQw7CMFdiK7M=
-github.com/leaanthony/u v1.1.1/go.mod h1:9+o6hejoRljvZ3BzdYlVL0JYCwtnAsVuN9pVTQcaRfI=
-github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80 h1:6Yzfa6GP0rIo/kULo2bwGEkFvCePZ3qHDDTC3/J9Swo=
-github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
-github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag=
-github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
-github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
-github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
-github.com/matryer/is v1.4.0/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
-github.com/matryer/is v1.4.1 h1:55ehd8zaGABKLXQUe2awZ99BD/PTc2ls+KV/dXphgEQ=
-github.com/matryer/is v1.4.1/go.mod h1:8I/i5uYgLzgsgEloJE1U6xx5HkBQpAZvepWuujKwMRU=
-github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA=
-github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg=
-github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
-github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
-github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
-github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
-github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
-github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
-github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
-github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
-github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
-github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
-github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
-github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
-github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
-github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
-github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde h1:x0TT0RDC7UhAVbbWWBzr41ElhJx5tXPWkIHA2HWPRuw=
-github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0=
-github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ=
-github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU=
-github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
-github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
-github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
-github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
-github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
-github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
-github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
-github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
-github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
-github.com/samber/lo v1.49.1 h1:4BIFyVfuQSEpluc7Fua+j1NolZHiEHEpaSEKdsH0tew=
-github.com/samber/lo v1.49.1/go.mod h1:dO6KHFzUKXgP8LDhU0oI8d2hekjXnGOu0DB8Jecxd6o=
-github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
-github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
-github.com/tkrajina/go-reflector v0.5.8 h1:yPADHrwmUbMq4RGEyaOUpz2H90sRsETNVpjzo3DLVQQ=
-github.com/tkrajina/go-reflector v0.5.8/go.mod h1:ECbqLgccecY5kPmPmXg1MrHW585yMcDkVl6IvJe64T4=
-github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
-github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
-github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
-github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
-github.com/wailsapp/go-webview2 v1.0.22 h1:YT61F5lj+GGaat5OB96Aa3b4QA+mybD0Ggq6NZijQ58=
-github.com/wailsapp/go-webview2 v1.0.22/go.mod h1:qJmWAmAmaniuKGZPWwne+uor3AHMB5PFhqiK0Bbj8kc=
-github.com/wailsapp/mimetype v1.4.1 h1:pQN9ycO7uo4vsUUuPeHEYoUkLVkaRntMnHJxVwYhwHs=
-github.com/wailsapp/mimetype v1.4.1/go.mod h1:9aV5k31bBOv5z6u+QP8TltzvNGJPmNJD4XlAL3U+j3o=
-github.com/wailsapp/wails/v2 v2.11.0 h1:seLacV8pqupq32IjS4Y7V8ucab0WZwtK6VvUVxSBtqQ=
-github.com/wailsapp/wails/v2 v2.11.0/go.mod h1:jrf0ZaM6+GBc1wRmXsM8cIvzlg0karYin3erahI4+0k=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
-github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
-go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64=
-go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
-go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0 h1:yI1/OhfEPy7J9eoa6Sj051C7n5dvpj0QX8g4sRchg04=
-go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.67.0/go.mod h1:NoUCKYWK+3ecatC4HjkRktREheMeEtrXoQxrqYFeHSc=
-go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 h1:OyrsyzuttWTSur2qN/Lm0m2a8yqyIjUVBZcxFPuXq2o=
-go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0/go.mod h1:C2NGBr+kAB4bk3xtMXfZ94gqFDtg/GkI7e9zqGh5Beg=
-go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
-go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
-go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
-go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
-go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
-go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
-golang.org/x/crypto v0.33.0 h1:IOBPskki6Lysi0lo9qQvbxiQ+FvsCC/YWOecCHAixus=
-golang.org/x/crypto v0.33.0/go.mod h1:bVdXmD7IV/4GdElGPozy6U7lWdRXA4qyRVGJV57uQ5M=
-golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4=
-golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA=
-golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 h1:mgKeJMpvi0yx/sU5GsxQ7p6s2wtOnGAHZWCHUM4KGzY=
-golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546/go.mod h1:j/pmGrbnkbPtQfxEe5D0VQhZC6qKbfKifgD0oM7sR70=
-golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8=
-golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w=
-golang.org/x/net v0.0.0-20210505024714-0287a6fb4125/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
-golang.org/x/net v0.35.0 h1:T5GQRQb2y08kTAByq9L4/bz8cipCdA8FbRTXewonqY8=
-golang.org/x/net v0.35.0/go.mod h1:EglIi67kWsHKlRzzVMUD93VMSWGFOMSZgxFjparz1Qk=
-golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0=
-golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw=
-golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs=
-golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q=
-golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4=
-golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
-golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
-golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
-golang.org/x/sys v0.0.0-20200810151505-1b9f1253b3ed/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
-golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
-golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
-golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
-golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
-golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
-golang.org/x/text v0.22.0 h1:bofq7m3/HAFvbF51jz3Q9wLg3jkvSPuiZu/pD1XwgtM=
-golang.org/x/text v0.22.0/go.mod h1:YRoo4H8PVmsu+E3Ou7cqLVH8oXWIHVoX0jqUWALQhfY=
-golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
-golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
-golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
-golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
-golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
-golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
-google.golang.org/api v0.275.0 h1:vfY5d9vFVJeWEZT65QDd9hbndr7FyZ2+6mIzGAh71NI=
-google.golang.org/api v0.275.0/go.mod h1:Fnag/EWUPIcJXuIkP1pjoTgS5vdxlk3eeemL7Do6bvw=
-google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 h1:XzmzkmB14QhVhgnawEVsOn6OFsnpyxNPRY9QV01dNB0=
-google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA=
-google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg=
-google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
-google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
-google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
-google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
-google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
-gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
-modernc.org/cc/v4 v4.27.1 h1:9W30zRlYrefrDV2JE2O8VDtJ1yPGownxciz5rrbQZis=
-modernc.org/cc/v4 v4.27.1/go.mod h1:uVtb5OGqUKpoLWhqwNQo/8LwvoiEBLvZXIQ/SmO6mL0=
-modernc.org/ccgo/v4 v4.32.0 h1:hjG66bI/kqIPX1b2yT6fr/jt+QedtP2fqojG2VrFuVw=
-modernc.org/ccgo/v4 v4.32.0/go.mod h1:6F08EBCx5uQc38kMGl+0Nm0oWczoo1c7cgpzEry7Uc0=
-modernc.org/fileutil v1.4.0 h1:j6ZzNTftVS054gi281TyLjHPp6CPHr2KCxEXjEbD6SM=
-modernc.org/fileutil v1.4.0/go.mod h1:EqdKFDxiByqxLk8ozOxObDSfcVOv/54xDs/DUHdvCUU=
-modernc.org/gc/v2 v2.6.5 h1:nyqdV8q46KvTpZlsw66kWqwXRHdjIlJOhG6kxiV/9xI=
-modernc.org/gc/v2 v2.6.5/go.mod h1:YgIahr1ypgfe7chRuJi2gD7DBQiKSLMPgBQe9oIiito=
-modernc.org/gc/v3 v3.1.2 h1:ZtDCnhonXSZexk/AYsegNRV1lJGgaNZJuKjJSWKyEqo=
-modernc.org/gc/v3 v3.1.2/go.mod h1:HFK/6AGESC7Ex+EZJhJ2Gni6cTaYpSMmU/cT9RmlfYY=
-modernc.org/goabi0 v0.2.0 h1:HvEowk7LxcPd0eq6mVOAEMai46V+i7Jrj13t4AzuNks=
-modernc.org/goabi0 v0.2.0/go.mod h1:CEFRnnJhKvWT1c1JTI3Avm+tgOWbkOu5oPA8eH8LnMI=
-modernc.org/libc v1.70.0 h1:U58NawXqXbgpZ/dcdS9kMshu08aiA6b7gusEusqzNkw=
-modernc.org/libc v1.70.0/go.mod h1:OVmxFGP1CI/Z4L3E0Q3Mf1PDE0BucwMkcXjjLntvHJo=
-modernc.org/mathutil v1.7.1 h1:GCZVGXdaN8gTqB1Mf/usp1Y/hSqgI2vAGGP4jZMCxOU=
-modernc.org/mathutil v1.7.1/go.mod h1:4p5IwJITfppl0G4sUEDtCr4DthTaT47/N3aT6MhfgJg=
-modernc.org/memory v1.11.0 h1:o4QC8aMQzmcwCK3t3Ux/ZHmwFPzE6hf2Y5LbkRs+hbI=
-modernc.org/memory v1.11.0/go.mod h1:/JP4VbVC+K5sU2wZi9bHoq2MAkCnrt2r98UGeSK7Mjw=
-modernc.org/opt v0.1.4 h1:2kNGMRiUjrp4LcaPuLY2PzUfqM/w9N23quVwhKt5Qm8=
-modernc.org/opt v0.1.4/go.mod h1:03fq9lsNfvkYSfxrfUhZCWPk1lm4cq4N+Bh//bEtgns=
-modernc.org/sortutil v1.2.1 h1:+xyoGf15mM3NMlPDnFqrteY07klSFxLElE2PVuWIJ7w=
-modernc.org/sortutil v1.2.1/go.mod h1:7ZI3a3REbai7gzCLcotuw9AC4VZVpYMjDzETGsSMqJE=
-modernc.org/sqlite v1.48.1 h1:S85iToyU6cgeojybE2XJlSbcsvcWkQ6qqNXJHtW5hWA=
-modernc.org/sqlite v1.48.1/go.mod h1:hWjRO6Tj/5Ik8ieqxQybiEOUXy0NJFNp2tpvVpKlvig=
-modernc.org/strutil v1.2.1 h1:UneZBkQA+DX2Rp35KcM69cSsNES9ly8mQWD71HKlOA0=
-modernc.org/strutil v1.2.1/go.mod h1:EHkiggD70koQxjVdSBM3JKM7k6L0FbGE5eymy9i3B9A=
-modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y=
-modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM=
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
deleted file mode 100644
index ebb7145b..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google.go
+++ /dev/null
@@ -1,502 +0,0 @@
-package speech
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"io"
-	"strings"
-	"time"
-
-	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
-	"google.golang.org/api/googleapi"
-	"google.golang.org/api/option"
-	"google.golang.org/grpc/codes"
-	"google.golang.org/grpc/status"
-	"google.golang.org/protobuf/types/known/durationpb"
-)
-
-type GoogleModel string
-
-const (
-	GoogleModelLatestLong            GoogleModel = "latest_long"
-	GoogleModelLatestShort           GoogleModel = "latest_short"
-	GoogleModelCommandAndSearch      GoogleModel = "command_and_search"
-	GoogleModelPhoneCall             GoogleModel = "phone_call"
-	GoogleModelVideo                 GoogleModel = "video"
-	GoogleModelDefault               GoogleModel = "default"
-	GoogleModelMedicalConversational GoogleModel = "medical_conversational"
-	GoogleModelMedicalDictation      GoogleModel = "medical_dictation"
-)
-
-type GoogleRecognitionConfig struct {
-	Encoding                            RecognitionEncoding
-	SampleRateHertz                     int32
-	AudioChannelCount                   int32
-	EnableSeparateRecognitionPerChannel bool
-	LanguageCode                        string
-	MaxAlternatives                     int32
-	ProfanityFilter                     bool
-	SpeechContexts                      []GoogleSpeechContext
-	EnableWordTimeOffsets               bool
-	EnableWordConfidence                bool
-	EnableAutomaticPunctuation          bool
-	EnableSpokenPunctuation             bool
-	Model                               string
-	UseEnhanced                         bool
-}
-
-type RecognitionEncoding string
-
-const (
-	EncodingLinear16             RecognitionEncoding = "LINEAR16"
-	EncodingFLAC                 RecognitionEncoding = "FLAC"
-	EncodingMULAW                RecognitionEncoding = "MULAW"
-	EncodingAMR                  RecognitionEncoding = "AMR"
-	EncodingAMRWB                RecognitionEncoding = "AMR_WB"
-	EncodingOGGOpus              RecognitionEncoding = "OGG_OPUS"
-	EncodingSpeexWithHeaderByte  RecognitionEncoding = "SPEEX_WITH_HEADER_BYTE"
-	EncodingMP3                  RecognitionEncoding = "MP3"
-	EncodingWEBMOpus             RecognitionEncoding = "WEBM_OPUS"
-	EncodingENCODING_UNSPECIFIED RecognitionEncoding = "ENCODING_UNSPECIFIED"
-)
-
-type GoogleSpeechContext struct {
-	Phrases []string
-	Boost   float32
-}
-
-type GoogleProvider struct {
-	apiKey          string
-	credentialsJSON string
-	baseURL         string
-	languageCode    string
-	model           GoogleModel
-	useEnhanced     bool
-	timeout         time.Duration
-	retries         int
-	client          googleRecognizeAPI
-}
-
-type GoogleOption func(*GoogleProvider)
-
-func WithGoogleBaseURL(url string) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.baseURL = strings.TrimRight(url, "/")
-	}
-}
-
-func WithGoogleLanguageCode(code string) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.languageCode = code
-	}
-}
-
-func WithGoogleModel(model GoogleModel) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.model = model
-	}
-}
-
-func WithGoogleEnhanced(enabled bool) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.useEnhanced = enabled
-	}
-}
-
-func WithGoogleTimeout(timeout time.Duration) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.timeout = timeout
-	}
-}
-
-func WithGoogleRetries(retries int) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.retries = retries
-	}
-}
-
-func WithGoogleCredentialsJSON(credentialsJSON string) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.credentialsJSON = credentialsJSON
-	}
-}
-
-func withGoogleRecognizeClient(client googleRecognizeAPI) GoogleOption {
-	return func(p *GoogleProvider) {
-		p.client = client
-	}
-}
-
-func NewGoogleProvider(apiKey string, opts ...GoogleOption) (*GoogleProvider, error) {
-	p := &GoogleProvider{
-		apiKey:       apiKey,
-		baseURL:      "https://speech.googleapis.com",
-		languageCode: "en-US",
-		model:        GoogleModelDefault,
-		timeout:      120 * time.Second,
-		retries:      2,
-	}
-
-	for _, opt := range opts {
-		opt(p)
-	}
-
-	if p.apiKey == "" && p.credentialsJSON == "" {
-		return nil, NewSTTError(ErrAuthentication, "google: API key or credentials JSON is required")
-	}
-
-	if p.client == nil {
-		client, err := newGoogleRecognizeClient(context.Background(), p.clientOptions()...)
-		if err != nil {
-			return nil, NewSTTErrorf(ErrAuthentication, "google-speech: failed to initialize official client: %v", err)
-		}
-		p.client = client
-	}
-
-	return p, nil
-}
-
-func (p *GoogleProvider) clientOptions() []option.ClientOption {
-	opts := make([]option.ClientOption, 0, 2)
-	if p.credentialsJSON != "" {
-		opts = append(opts, option.WithCredentialsJSON([]byte(p.credentialsJSON)))
-	} else {
-		opts = append(opts, option.WithAPIKey(p.apiKey))
-	}
-	if p.baseURL != "" && p.baseURL != "https://speech.googleapis.com" {
-		opts = append(opts, option.WithEndpoint(p.baseURL))
-	}
-	return opts
-}
-
-func (p *GoogleProvider) Name() string {
-	return "google-speech"
-}
-
-func (p *GoogleProvider) Type() STTProviderType {
-	return STTProviderGoogle
-}
-
-func (p *GoogleProvider) Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error) {
-	options := TranscribeOptions{
-		Language:    p.languageCode,
-		Mode:        ModeTranscription,
-		InputFormat: InputMP3,
-	}
-	for _, opt := range opts {
-		opt(&options)
-	}
-
-	if len(audio) == 0 {
-		return nil, NewSTTError(ErrAudioFormatInvalid, "google-speech: audio data is empty")
-	}
-
-	const maxAudioSize = 100 * 1024 * 1024
-	if len(audio) > maxAudioSize {
-		return nil, NewSTTErrorf(ErrAudioTooLarge, "google-speech: audio exceeds 100MB limit (%d bytes)", len(audio))
-	}
-
-	if !validInputFormats[options.InputFormat] {
-		return nil, NewSTTErrorf(ErrAudioFormatInvalid, "google-speech: unsupported input format: %s", options.InputFormat)
-	}
-
-	var lastErr error
-	for attempt := 0; attempt <= p.retries; attempt++ {
-		if attempt > 0 {
-			backoff := time.Duration(attempt) * time.Second
-			select {
-			case <-ctx.Done():
-				return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: context cancelled during retry: %v", ctx.Err())
-			case <-time.After(backoff):
-			}
-		}
-
-		result, err := p.doTranscribe(ctx, audio, options)
-		if err == nil {
-			return result, nil
-		}
-
-		lastErr = err
-
-		if sttErr, ok := err.(*STTError); ok {
-			if sttErr.Code == ErrAuthentication || sttErr.Code == ErrAudioFormatInvalid || sttErr.Code == ErrAudioTooLarge || sttErr.Code == ErrRateLimited {
-				return nil, err
-			}
-		}
-	}
-
-	return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: all %d retries failed: %v", p.retries, lastErr)
-}
-
-func (p *GoogleProvider) TranscribeFile(ctx context.Context, filePath string, opts ...TranscribeOption) (*TranscriptResult, error) {
-	return nil, NewSTTError(ErrProviderNotSupported, "google-speech: file transcription requires GCS URI, use Transcribe with file content instead")
-}
-
-func (p *GoogleProvider) TranscribeStream(ctx context.Context, reader io.Reader, opts ...TranscribeOption) (*TranscriptResult, error) {
-	if reader == nil {
-		return nil, NewSTTError(ErrAudioFormatInvalid, "google-speech: reader is nil")
-	}
-
-	audio, err := io.ReadAll(reader)
-	if err != nil {
-		return nil, NewSTTErrorf(ErrTranscriptionFailed, "google-speech: failed to read stream: %v", err)
-	}
-
-	return p.Transcribe(ctx, audio, opts...)
-}
-
-func (p *GoogleProvider) doTranscribe(ctx context.Context, audio []byte, options TranscribeOptions) (*TranscriptResult, error) {
-	req := p.buildRecognizeRequest(audio, options)
-
-	requestCtx := ctx
-	var cancel context.CancelFunc
-	if _, hasDeadline := ctx.Deadline(); !hasDeadline && p.timeout > 0 {
-		requestCtx, cancel = context.WithTimeout(ctx, p.timeout)
-		defer cancel()
-	}
-
-	resp, err := p.client.Recognize(requestCtx, req)
-	if err != nil {
-		return nil, p.handleClientError(err)
-	}
-
-	return p.parseRecognizeResponse(resp, options)
-}
-
-func (p *GoogleProvider) buildRecognizeRequest(audio []byte, options TranscribeOptions) *speechpb.RecognizeRequest {
-	encoding := p.mapInputFormatToEncoding(options.InputFormat)
-
-	sampleRate := int32(options.SampleRate)
-	if sampleRate == 0 {
-		sampleRate = p.guessSampleRate(options.InputFormat)
-	}
-
-	return &speechpb.RecognizeRequest{
-		Config: &speechpb.RecognitionConfig{
-			Encoding:                   p.toProtoRecognitionEncoding(encoding),
-			SampleRateHertz:            sampleRate,
-			LanguageCode:               options.Language,
-			Model:                      string(p.model),
-			UseEnhanced:                p.useEnhanced,
-			MaxAlternatives:            int32(options.MaxAlternatives),
-			EnableWordTimeOffsets:      options.WordTimestamps,
-			EnableWordConfidence:       true,
-			EnableAutomaticPunctuation: true,
-		},
-		Audio: &speechpb.RecognitionAudio{
-			AudioSource: &speechpb.RecognitionAudio_Content{Content: audio},
-		},
-	}
-}
-
-func (p *GoogleProvider) mapInputFormatToEncoding(format AudioInputFormat) RecognitionEncoding {
-	switch format {
-	case InputWAV, InputPCM:
-		return EncodingLinear16
-	case InputFLAC:
-		return EncodingFLAC
-	case InputMP3:
-		return EncodingMP3
-	case InputOGG:
-		return EncodingOGGOpus
-	case InputWEBM:
-		return EncodingWEBMOpus
-	case InputM4A, InputMP4:
-		return EncodingWEBMOpus
-	case InputMPEG, InputMPGA:
-		return EncodingMP3
-	default:
-		return EncodingMP3
-	}
-}
-
-func (p *GoogleProvider) toProtoRecognitionEncoding(encoding RecognitionEncoding) speechpb.RecognitionConfig_AudioEncoding {
-	switch encoding {
-	case EncodingLinear16:
-		return speechpb.RecognitionConfig_LINEAR16
-	case EncodingFLAC:
-		return speechpb.RecognitionConfig_FLAC
-	case EncodingMULAW:
-		return speechpb.RecognitionConfig_MULAW
-	case EncodingAMR:
-		return speechpb.RecognitionConfig_AMR
-	case EncodingAMRWB:
-		return speechpb.RecognitionConfig_AMR_WB
-	case EncodingOGGOpus:
-		return speechpb.RecognitionConfig_OGG_OPUS
-	case EncodingSpeexWithHeaderByte:
-		return speechpb.RecognitionConfig_SPEEX_WITH_HEADER_BYTE
-	case EncodingWEBMOpus:
-		return speechpb.RecognitionConfig_WEBM_OPUS
-	case EncodingMP3:
-		return speechpb.RecognitionConfig_MP3
-	default:
-		return speechpb.RecognitionConfig_ENCODING_UNSPECIFIED
-	}
-}
-
-func (p *GoogleProvider) guessSampleRate(format AudioInputFormat) int32 {
-	switch format {
-	case InputWAV, InputPCM:
-		return 16000
-	case InputFLAC:
-		return 16000
-	case InputMP3:
-		return 16000
-	case InputOGG, InputWEBM:
-		return 48000
-	case InputM4A, InputMP4:
-		return 44100
-	default:
-		return 16000
-	}
-}
-
-func (p *GoogleProvider) handleClientError(err error) error {
-	if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
-		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request context error: %v", err)
-	}
-
-	var apiErr *googleapi.Error
-	if errors.As(err, &apiErr) {
-		msg := fmt.Sprintf("google-speech: API error: %s", apiErr.Message)
-		switch apiErr.Code {
-		case 400:
-			return NewSTTError(ErrAudioFormatInvalid, msg)
-		case 401, 403:
-			return NewSTTError(ErrAuthentication, msg)
-		case 429:
-			return NewSTTError(ErrRateLimited, msg)
-		default:
-			return NewSTTError(ErrTranscriptionFailed, msg)
-		}
-	}
-
-	switch status.Code(err) {
-	case codes.InvalidArgument:
-		return NewSTTError(ErrAudioFormatInvalid, "google-speech: invalid recognition request")
-	case codes.Unauthenticated, codes.PermissionDenied:
-		return NewSTTError(ErrAuthentication, "google-speech: authentication failed")
-	case codes.ResourceExhausted:
-		return NewSTTError(ErrRateLimited, "google-speech: rate limited")
-	default:
-		return NewSTTErrorf(ErrTranscriptionFailed, "google-speech: request failed: %v", err)
-	}
-}
-
-func (p *GoogleProvider) parseRecognizeResponse(resp *speechpb.RecognizeResponse, options TranscribeOptions) (*TranscriptResult, error) {
-	results := resp.GetResults()
-	if len(results) == 0 {
-		return &TranscriptResult{
-			Text:     "",
-			Language: options.Language,
-		}, nil
-	}
-
-	result := &TranscriptResult{}
-	var totalConfidence float64
-	var confidenceCount int
-	var lastEnd time.Duration
-
-	for i, res := range results {
-		if len(res.GetAlternatives()) == 0 {
-			continue
-		}
-
-		primary := res.GetAlternatives()[0]
-		segment := SegmentInfo{
-			ID:   i,
-			Text: primary.GetTranscript(),
-		}
-
-		if confidence := primary.GetConfidence(); confidence > 0 {
-			segment.Confidence = float64(confidence)
-			totalConfidence += segment.Confidence
-			confidenceCount++
-		}
-
-		if len(primary.GetWords()) > 0 {
-			segment.Words = make([]WordInfo, 0, len(primary.GetWords()))
-			for _, word := range primary.GetWords() {
-				wordInfo := WordInfo{
-					Word:       word.GetWord(),
-					StartTime:  parseProtoDuration(word.GetStartTime()),
-					EndTime:    parseProtoDuration(word.GetEndTime()),
-					Confidence: float64(word.GetConfidence()),
-				}
-				segment.Words = append(segment.Words, wordInfo)
-			}
-			segment.StartTime = segment.Words[0].StartTime
-			segment.EndTime = segment.Words[len(segment.Words)-1].EndTime
-		} else {
-			segment.EndTime = parseProtoDuration(res.GetResultEndTime())
-		}
-
-		if options.WordTimestamps && len(segment.Words) > 0 {
-			result.Words = append(result.Words, segment.Words...)
-		}
-
-		result.Segments = append(result.Segments, segment)
-
-		if i == 0 {
-			result.Text = primary.GetTranscript()
-			if lang := res.GetLanguageCode(); lang != "" {
-				result.Language = lang
-			}
-		} else {
-			result.Text += " " + primary.GetTranscript()
-		}
-
-		if options.MaxAlternatives > 1 && len(res.GetAlternatives()) > 1 {
-			for _, alt := range res.GetAlternatives()[1:] {
-				result.Alternatives = append(result.Alternatives, alt.GetTranscript())
-			}
-		}
-
-		if segment.EndTime > lastEnd {
-			lastEnd = segment.EndTime
-		}
-	}
-
-	if result.Language == "" {
-		result.Language = options.Language
-	}
-
-	if confidenceCount > 0 {
-		result.Confidence = totalConfidence / float64(confidenceCount)
-	}
-
-	if lastEnd > 0 {
-		result.Duration = lastEnd
-	} else {
-		result.Duration = parseProtoDuration(resp.GetTotalBilledTime())
-	}
-
-	return result, nil
-}
-
-func parseProtoDuration(d *durationpb.Duration) time.Duration {
-	if d == nil {
-		return 0
-	}
-	return d.AsDuration()
-}
-
-func (p *GoogleProvider) ListLanguages(ctx context.Context) ([]string, error) {
-	return []string{
-		"af-ZA", "am-ET", "hy-AM", "az-AZ", "id-ID", "ms-MY", "bn-BD", "bn-IN", "ca-ES", "cs-CZ",
-		"da-DK", "de-DE", "en-AU", "en-CA", "en-GH", "en-GB", "en-IN", "en-IE", "en-KE", "en-NZ",
-		"en-NG", "en-PH", "en-SG", "en-ZA", "en-TZ", "en-US", "es-AR", "es-BO", "es-CL", "es-CO",
-		"es-CR", "es-EC", "es-SV", "es-ES", "es-US", "es-GT", "es-HN", "es-MX", "es-NI", "es-PA",
-		"es-PY", "es-PE", "es-PR", "es-DO", "es-UY", "es-VE", "eu-ES", "fil-PH", "fr-CA", "fr-FR",
-		"gl-ES", "ka-GE", "gu-IN", "hr-HR", "zu-ZA", "is-IS", "it-IT", "jv-ID", "kn-IN", "km-KH",
-		"lo-LA", "lv-LV", "lt-LT", "hu-HU", "ml-IN", "mr-IN", "nl-NL", "ne-NP", "nb-NO", "pl-PL",
-		"pt-BR", "pt-PT", "ro-RO", "si-LK", "sk-SK", "sl-SI", "sr-RS", "fi-FI", "sv-SE", "ta-IN",
-		"ta-SG", "ta-LK", "ta-MY", "te-IN", "vi-VN", "tr-TR", "ur-IN", "ur-PK", "el-GR", "bg-BG",
-		"ru-RU", "sr-RS", "uk-UA", "he-IL", "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IQ", "ar-JO",
-		"ar-KW", "ar-LB", "ar-LY", "ar-MA", "ar-OM", "ar-QA", "ar-SA", "ar-PS", "ar-SY", "ar-TN",
-		"ar-YE", "fa-IR", "hi-IN", "th-TH", "ko-KR", "zh-TW", "ja-JP", "zh", "zh-CN", "zh-HK",
-		"yue-Hant-HK",
-	}, nil
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
deleted file mode 100644
index aed01cb6..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_client.go
+++ /dev/null
@@ -1,34 +0,0 @@
-package speech
-
-import (
-	"context"
-
-	speechapi "cloud.google.com/go/speech/apiv1"
-	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
-	"google.golang.org/api/option"
-)
-
-type googleRecognizeAPI interface {
-	Recognize(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
-	Close() error
-}
-
-type googleRecognizeClient struct {
-	client *speechapi.Client
-}
-
-func newGoogleRecognizeClient(ctx context.Context, clientOpts ...option.ClientOption) (googleRecognizeAPI, error) {
-	client, err := speechapi.NewRESTClient(ctx, clientOpts...)
-	if err != nil {
-		return nil, err
-	}
-	return &googleRecognizeClient{client: client}, nil
-}
-
-func (c *googleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-	return c.client.Recognize(ctx, req)
-}
-
-func (c *googleRecognizeClient) Close() error {
-	return c.client.Close()
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
deleted file mode 100644
index cb7fbdac..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_google_test.go
+++ /dev/null
@@ -1,544 +0,0 @@
-package speech
-
-import (
-	"context"
-	"errors"
-	"math"
-	"strings"
-	"testing"
-	"time"
-
-	speechpb "cloud.google.com/go/speech/apiv1/speechpb"
-	"google.golang.org/api/googleapi"
-	"google.golang.org/protobuf/types/known/durationpb"
-)
-
-type fakeGoogleRecognizeClient struct {
-	calls       int
-	lastRequest *speechpb.RecognizeRequest
-	recognizeFn func(context.Context, *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error)
-}
-
-func (f *fakeGoogleRecognizeClient) Recognize(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-	f.calls++
-	f.lastRequest = req
-	if f.recognizeFn != nil {
-		return f.recognizeFn(ctx, req)
-	}
-	return &speechpb.RecognizeResponse{}, nil
-}
-
-func (f *fakeGoogleRecognizeClient) Close() error {
-	return nil
-}
-
-func TestNewGoogleProvider(t *testing.T) {
-	t.Run("requires API key or credentials JSON", func(t *testing.T) {
-		_, err := NewGoogleProvider("", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-		if err == nil {
-			t.Fatal("expected error when auth config is empty")
-		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrAuthentication {
-			t.Errorf("expected ErrAuthentication, got %s", sttErr.Code)
-		}
-	})
-
-	t.Run("creates provider with defaults", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{}
-		p, err := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if p.Name() != "google-speech" {
-			t.Errorf("expected name google-speech, got %s", p.Name())
-		}
-		if p.Type() != STTProviderGoogle {
-			t.Errorf("expected type %s, got %s", STTProviderGoogle, p.Type())
-		}
-		if p.baseURL != "https://speech.googleapis.com" {
-			t.Errorf("expected default baseURL, got %s", p.baseURL)
-		}
-		if p.languageCode != "en-US" {
-			t.Errorf("expected default language en-US, got %s", p.languageCode)
-		}
-		if p.model != GoogleModelDefault {
-			t.Errorf("expected default model %s, got %s", GoogleModelDefault, p.model)
-		}
-		if p.retries != 2 {
-			t.Errorf("expected 2 retries, got %d", p.retries)
-		}
-		if p.client != fake {
-			t.Fatal("expected injected fake client to be used")
-		}
-	})
-
-	t.Run("applies options", func(t *testing.T) {
-		p, err := NewGoogleProvider("test-key",
-			withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}),
-			WithGoogleBaseURL("https://custom.speech.api.com/"),
-			WithGoogleLanguageCode("zh-CN"),
-			WithGoogleModel(GoogleModelLatestLong),
-			WithGoogleEnhanced(true),
-			WithGoogleTimeout(30*time.Second),
-			WithGoogleRetries(5),
-			WithGoogleCredentialsJSON(`{"type":"service_account"}`),
-		)
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if p.baseURL != "https://custom.speech.api.com" {
-			t.Errorf("expected custom baseURL, got %s", p.baseURL)
-		}
-		if p.languageCode != "zh-CN" {
-			t.Errorf("expected language zh-CN, got %s", p.languageCode)
-		}
-		if p.model != GoogleModelLatestLong {
-			t.Errorf("expected model %s, got %s", GoogleModelLatestLong, p.model)
-		}
-		if !p.useEnhanced {
-			t.Error("expected useEnhanced to be true")
-		}
-		if p.timeout != 30*time.Second {
-			t.Errorf("expected 30s timeout, got %v", p.timeout)
-		}
-		if p.retries != 5 {
-			t.Errorf("expected 5 retries, got %d", p.retries)
-		}
-		if p.credentialsJSON == "" {
-			t.Error("expected credentials JSON to be stored")
-		}
-	})
-}
-
-func TestGoogleProviderTranscribe(t *testing.T) {
-	t.Run("rejects empty audio", func(t *testing.T) {
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-		_, err := p.Transcribe(context.Background(), nil)
-		if err == nil {
-			t.Fatal("expected error for empty audio")
-		}
-	})
-
-	t.Run("rejects audio too large", func(t *testing.T) {
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-		largeAudio := make([]byte, 101*1024*1024)
-		_, err := p.Transcribe(context.Background(), largeAudio)
-		if err == nil {
-			t.Fatal("expected error for audio too large")
-		}
-		sttErr, ok := err.(*STTError)
-		if !ok {
-			t.Fatalf("expected *STTError, got %T", err)
-		}
-		if sttErr.Code != ErrAudioTooLarge {
-			t.Errorf("expected ErrAudioTooLarge, got %s", sttErr.Code)
-		}
-	})
-
-	t.Run("successful transcription and request mapping", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{
-			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-				return &speechpb.RecognizeResponse{
-					Results: []*speechpb.SpeechRecognitionResult{
-						{
-							Alternatives: []*speechpb.SpeechRecognitionAlternative{
-								{
-									Transcript: "Hello world",
-									Confidence: 0.95,
-									Words: []*speechpb.WordInfo{
-										{
-											Word:       "Hello",
-											Confidence: 0.96,
-											StartTime:  durationpb.New(0),
-											EndTime:    durationpb.New(500 * time.Millisecond),
-										},
-										{
-											Word:       "world",
-											Confidence: 0.94,
-											StartTime:  durationpb.New(600 * time.Millisecond),
-											EndTime:    durationpb.New(time.Second),
-										},
-									},
-								},
-							},
-							LanguageCode:  "en-US",
-							ResultEndTime: durationpb.New(2500 * time.Millisecond),
-						},
-					},
-				}, nil
-			},
-		}
-
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio-data"),
-			WithSTTLanguage("zh-CN"),
-			WithSTTWordTimestamps(true),
-			WithSTTMaxAlternatives(3),
-		)
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-
-		if result.Text != "Hello world" {
-			t.Errorf("expected 'Hello world', got '%s'", result.Text)
-		}
-		if result.Language != "en-US" {
-			t.Errorf("expected language 'en-US', got '%s'", result.Language)
-		}
-		if result.Duration != time.Second {
-			t.Errorf("expected duration 1s from word timestamps, got %v", result.Duration)
-		}
-		if len(result.Segments) != 1 {
-			t.Fatalf("expected 1 segment, got %d", len(result.Segments))
-		}
-		if len(result.Segments[0].Words) != 2 {
-			t.Fatalf("expected 2 words, got %d", len(result.Segments[0].Words))
-		}
-		if math.Abs(result.Confidence-0.95) > 0.0001 {
-			t.Errorf("expected confidence 0.95, got %f", result.Confidence)
-		}
-
-		req := fake.lastRequest
-		if req == nil {
-			t.Fatal("expected request to be captured")
-		}
-		if req.GetConfig().GetLanguageCode() != "zh-CN" {
-			t.Errorf("expected request language zh-CN, got %s", req.GetConfig().GetLanguageCode())
-		}
-		if !req.GetConfig().GetEnableWordTimeOffsets() {
-			t.Error("expected EnableWordTimeOffsets to be true")
-		}
-		if req.GetConfig().GetMaxAlternatives() != 3 {
-			t.Errorf("expected max alternatives 3, got %d", req.GetConfig().GetMaxAlternatives())
-		}
-		if req.GetConfig().GetEncoding() != speechpb.RecognitionConfig_MP3 {
-			t.Errorf("expected MP3 encoding, got %v", req.GetConfig().GetEncoding())
-		}
-		if len(req.GetAudio().GetContent()) == 0 {
-			t.Error("expected inline audio content to be populated")
-		}
-	})
-
-	t.Run("multiple segments and alternatives", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{
-			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-				return &speechpb.RecognizeResponse{
-					Results: []*speechpb.SpeechRecognitionResult{
-						{
-							Alternatives: []*speechpb.SpeechRecognitionAlternative{
-								{Transcript: "First segment", Confidence: 0.9},
-								{Transcript: "First segments", Confidence: 0.7},
-							},
-							LanguageCode:  "en-US",
-							ResultEndTime: durationpb.New(time.Second),
-						},
-						{
-							Alternatives: []*speechpb.SpeechRecognitionAlternative{
-								{Transcript: "Second segment", Confidence: 0.8},
-								{Transcript: "Second segments", Confidence: 0.6},
-							},
-							LanguageCode:  "en-US",
-							ResultEndTime: durationpb.New(2 * time.Second),
-						},
-					},
-				}, nil
-			},
-		}
-
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"), WithSTTMaxAlternatives(3))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-
-		if result.Text != "First segment Second segment" {
-			t.Errorf("unexpected combined text: %s", result.Text)
-		}
-		if len(result.Segments) != 2 {
-			t.Fatalf("expected 2 segments, got %d", len(result.Segments))
-		}
-		if len(result.Alternatives) != 2 {
-			t.Fatalf("expected 2 alternatives, got %d", len(result.Alternatives))
-		}
-		if result.Duration != 2*time.Second {
-			t.Errorf("expected duration 2s, got %v", result.Duration)
-		}
-	})
-
-	t.Run("empty results", func(t *testing.T) {
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if result.Text != "" {
-			t.Errorf("expected empty text, got %q", result.Text)
-		}
-	})
-
-	t.Run("does not retry auth errors", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{
-			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-				return nil, &googleapi.Error{Code: 401, Message: "invalid API key"}
-			},
-		}
-		p, _ := NewGoogleProvider("bad-key", withGoogleRecognizeClient(fake), WithGoogleRetries(3))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected authentication error")
-		}
-		if fake.calls != 1 {
-			t.Errorf("expected 1 call, got %d", fake.calls)
-		}
-	})
-
-	t.Run("retries transient errors then succeeds", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{}
-		fake.recognizeFn = func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-			if fake.calls == 1 {
-				return nil, &googleapi.Error{Code: 503, Message: "service unavailable"}
-			}
-			return &speechpb.RecognizeResponse{
-				Results: []*speechpb.SpeechRecognitionResult{
-					{
-						Alternatives:  []*speechpb.SpeechRecognitionAlternative{{Transcript: "Success after retry"}},
-						LanguageCode:  "en-US",
-						ResultEndTime: durationpb.New(time.Second),
-					},
-				},
-			}, nil
-		}
-
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(2))
-		result, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if result.Text != "Success after retry" {
-			t.Errorf("expected 'Success after retry', got '%s'", result.Text)
-		}
-		if fake.calls != 2 {
-			t.Errorf("expected 2 calls, got %d", fake.calls)
-		}
-	})
-
-	t.Run("context cancellation", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{
-			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-				return nil, context.Canceled
-			},
-		}
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake), WithGoogleRetries(0))
-		_, err := p.Transcribe(context.Background(), []byte("fake-audio"))
-		if err == nil {
-			t.Fatal("expected context cancellation error")
-		}
-	})
-}
-
-func TestGoogleProviderTranscribeStream(t *testing.T) {
-	t.Run("rejects nil reader", func(t *testing.T) {
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-		_, err := p.TranscribeStream(context.Background(), nil)
-		if err == nil {
-			t.Fatal("expected error for nil reader")
-		}
-	})
-
-	t.Run("successful stream transcription", func(t *testing.T) {
-		fake := &fakeGoogleRecognizeClient{
-			recognizeFn: func(ctx context.Context, req *speechpb.RecognizeRequest) (*speechpb.RecognizeResponse, error) {
-				return &speechpb.RecognizeResponse{
-					Results: []*speechpb.SpeechRecognitionResult{
-						{
-							Alternatives: []*speechpb.SpeechRecognitionAlternative{{Transcript: "Stream content", Confidence: 0.9}},
-							LanguageCode: "en-US",
-						},
-					},
-				}, nil
-			},
-		}
-		p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(fake))
-		reader := strings.NewReader("stream-audio-data")
-		result, err := p.TranscribeStream(context.Background(), reader)
-		if err != nil {
-			t.Fatalf("unexpected error: %v", err)
-		}
-		if result.Text != "Stream content" {
-			t.Errorf("expected 'Stream content', got '%s'", result.Text)
-		}
-	})
-}
-
-func TestGoogleProviderTranscribeFile(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-	_, err := p.TranscribeFile(context.Background(), "/some/file.mp3")
-	if err == nil {
-		t.Fatal("expected error for file transcription")
-	}
-	sttErr, ok := err.(*STTError)
-	if !ok {
-		t.Fatalf("expected *STTError, got %T", err)
-	}
-	if sttErr.Code != ErrProviderNotSupported {
-		t.Errorf("expected ErrProviderNotSupported, got %s", sttErr.Code)
-	}
-}
-
-func TestGoogleProviderListLanguages(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-	langs, err := p.ListLanguages(context.Background())
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if len(langs) == 0 {
-		t.Fatal("expected non-empty language list")
-	}
-}
-
-func TestGoogleProviderEncodingMapping(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-
-	tests := []struct {
-		format AudioInputFormat
-		want   RecognitionEncoding
-	}{
-		{InputWAV, EncodingLinear16},
-		{InputPCM, EncodingLinear16},
-		{InputFLAC, EncodingFLAC},
-		{InputMP3, EncodingMP3},
-		{InputOGG, EncodingOGGOpus},
-		{InputWEBM, EncodingWEBMOpus},
-		{InputM4A, EncodingWEBMOpus},
-		{InputMP4, EncodingWEBMOpus},
-		{InputMPEG, EncodingMP3},
-		{InputMPGA, EncodingMP3},
-	}
-
-	for _, tt := range tests {
-		t.Run(string(tt.format), func(t *testing.T) {
-			got := p.mapInputFormatToEncoding(tt.format)
-			if got != tt.want {
-				t.Errorf("mapInputFormatToEncoding(%s) = %s, want %s", tt.format, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestGoogleProviderSampleRateGuessing(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-
-	tests := []struct {
-		format AudioInputFormat
-		want   int32
-	}{
-		{InputWAV, 16000},
-		{InputPCM, 16000},
-		{InputFLAC, 16000},
-		{InputMP3, 16000},
-		{InputOGG, 48000},
-		{InputWEBM, 48000},
-		{InputM4A, 44100},
-		{InputMP4, 44100},
-	}
-
-	for _, tt := range tests {
-		t.Run(string(tt.format), func(t *testing.T) {
-			got := p.guessSampleRate(tt.format)
-			if got != tt.want {
-				t.Errorf("guessSampleRate(%s) = %d, want %d", tt.format, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestParseProtoDuration(t *testing.T) {
-	tests := []struct {
-		name string
-		d    *durationpb.Duration
-		want time.Duration
-	}{
-		{"nil", nil, 0},
-		{"zero", durationpb.New(0), 0},
-		{"one second", durationpb.New(time.Second), time.Second},
-		{"500ms", durationpb.New(500 * time.Millisecond), 500 * time.Millisecond},
-		{"2.5s", durationpb.New(2500 * time.Millisecond), 2500 * time.Millisecond},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got := parseProtoDuration(tt.d)
-			if got != tt.want {
-				t.Errorf("parseProtoDuration(%v) = %v, want %v", tt.d, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestNewSTTProviderGoogle(t *testing.T) {
-	p, err := NewSTTProvider(STTConfig{
-		Type:    STTProviderGoogle,
-		APIKey:  "test-key",
-		Timeout: 30 * time.Second,
-	})
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if p.Type() != STTProviderGoogle {
-		t.Errorf("expected STTProviderGoogle, got %s", p.Type())
-	}
-	if p.Name() != "google-speech" {
-		t.Errorf("expected name 'google-speech', got %s", p.Name())
-	}
-}
-
-func TestGoogleSTTManager(t *testing.T) {
-	m := NewSTTManager()
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-
-	err := m.Register("google", p)
-	if err != nil {
-		t.Fatalf("failed to register provider: %v", err)
-	}
-
-	got, err := m.Get("google")
-	if err != nil {
-		t.Fatalf("failed to get provider: %v", err)
-	}
-	if got.Type() != STTProviderGoogle {
-		t.Errorf("expected STTProviderGoogle, got %s", got.Type())
-	}
-}
-
-func TestGoogleProviderHandleClientError(t *testing.T) {
-	p, _ := NewGoogleProvider("test-key", withGoogleRecognizeClient(&fakeGoogleRecognizeClient{}))
-
-	tests := []struct {
-		name string
-		err  error
-		want STTErrorCode
-	}{
-		{"bad request", &googleapi.Error{Code: 400, Message: "bad request"}, ErrAudioFormatInvalid},
-		{"unauthorized", &googleapi.Error{Code: 401, Message: "unauthorized"}, ErrAuthentication},
-		{"forbidden", &googleapi.Error{Code: 403, Message: "forbidden"}, ErrAuthentication},
-		{"rate limited", &googleapi.Error{Code: 429, Message: "quota"}, ErrRateLimited},
-		{"generic", errors.New("boom"), ErrTranscriptionFailed},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			err := p.handleClientError(tt.err)
-			sttErr, ok := err.(*STTError)
-			if !ok {
-				t.Fatalf("expected *STTError, got %T", err)
-			}
-			if sttErr.Code != tt.want {
-				t.Errorf("expected %s, got %s", tt.want, sttErr.Code)
-			}
-		})
-	}
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go b/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
deleted file mode 100644
index 10f6bd5a..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/03_stt_google_official_client/pkg/speech/stt_provider.go
+++ /dev/null
@@ -1,249 +0,0 @@
-package speech
-
-import (
-	"context"
-	"fmt"
-	"time"
-)
-
-type STTProviderType string
-
-const (
-	STTProviderOpenAI        STTProviderType = "openai"
-	STTProviderAzure         STTProviderType = "azure"
-	STTProviderGoogle        STTProviderType = "google"
-	STTProviderDeepgram      STTProviderType = "deepgram"
-	STTProviderAssemblyAI    STTProviderType = "assemblyai"
-	STTProviderWhisperCPP    STTProviderType = "whisper.cpp"
-	STTProviderVosk          STTProviderType = "vosk"
-	STTProviderFasterWhisper STTProviderType = "faster-whisper"
-	STTProviderCustom        STTProviderType = "custom"
-)
-
-type AudioInputFormat string
-
-const (
-	InputMP3  AudioInputFormat = "mp3"
-	InputWAV  AudioInputFormat = "wav"
-	InputOGG  AudioInputFormat = "ogg"
-	InputFLAC AudioInputFormat = "flac"
-	InputPCM  AudioInputFormat = "pcm"
-	InputM4A  AudioInputFormat = "m4a"
-	InputMP4  AudioInputFormat = "mp4"
-	InputMPEG AudioInputFormat = "mpeg"
-	InputMPGA AudioInputFormat = "mpga"
-	InputWEBM AudioInputFormat = "webm"
-)
-
-type STTProvider interface {
-	Name() string
-	Type() STTProviderType
-	Transcribe(ctx context.Context, audio []byte, opts ...TranscribeOption) (*TranscriptResult, error)
-	ListLanguages(ctx context.Context) ([]string, error)
-}
-
-type STTConfig struct {
-	Type            STTProviderType
-	APIKey          string
-	CredentialsJSON string
-	BaseURL         string
-	Model           string
-	Language        string
-	SampleRate      int
-	Timeout         time.Duration
-}
-
-func NewSTTProvider(cfg STTConfig) (STTProvider, error) {
-	switch cfg.Type {
-	case STTProviderOpenAI:
-		opts := []WhisperOption{}
-		if cfg.BaseURL != "" {
-			opts = append(opts, WithWhisperBaseURL(cfg.BaseURL))
-		}
-		if cfg.Model != "" {
-			opts = append(opts, WithWhisperModel(WhisperModel(cfg.Model)))
-		}
-		if cfg.Language != "" {
-			opts = append(opts, WithWhisperLanguage(cfg.Language))
-		}
-		if cfg.Timeout > 0 {
-			opts = append(opts, WithWhisperTimeout(cfg.Timeout))
-		}
-		return NewWhisperProvider(cfg.APIKey, opts...)
-	case STTProviderGoogle:
-		opts := []GoogleOption{}
-		if cfg.CredentialsJSON != "" {
-			opts = append(opts, WithGoogleCredentialsJSON(cfg.CredentialsJSON))
-		}
-		if cfg.BaseURL != "" {
-			opts = append(opts, WithGoogleBaseURL(cfg.BaseURL))
-		}
-		if cfg.Language != "" {
-			opts = append(opts, WithGoogleLanguageCode(cfg.Language))
-		}
-		if cfg.Timeout > 0 {
-			opts = append(opts, WithGoogleTimeout(cfg.Timeout))
-		}
-		return NewGoogleProvider(cfg.APIKey, opts...)
-	case STTProviderWhisperCPP:
-		opts := []WhisperCPPOption{}
-		if cfg.Model != "" {
-			opts = append(opts, WithWhisperCPPModelPath(cfg.Model))
-		}
-		if cfg.Language != "" {
-			opts = append(opts, WithWhisperCPPLanguage(cfg.Language))
-		}
-		if cfg.Timeout > 0 {
-			opts = append(opts, WithWhisperCPPTimeout(cfg.Timeout))
-		}
-		return NewWhisperCPPProvider(opts...)
-	default:
-		return nil, NewSTTError(ErrProviderNotSupported, "unknown STT provider: "+string(cfg.Type))
-	}
-}
-
-type TranscribeMode string
-
-const (
-	ModeTranscription TranscribeMode = "transcription"
-	ModeTranslation   TranscribeMode = "translation"
-)
-
-type TranscribeOptions struct {
-	Language        string
-	Model           string
-	Prompt          string
-	Temperature     float64
-	Mode            TranscribeMode
-	InputFormat     AudioInputFormat
-	SampleRate      int
-	WordTimestamps  bool
-	SpeakerLabels   bool
-	MaxAlternatives int
-}
-
-type TranscribeOption func(*TranscribeOptions)
-
-func WithSTTLanguage(lang string) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.Language = lang
-	}
-}
-
-func WithSTTModel(model string) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.Model = model
-	}
-}
-
-func WithSTTPrompt(prompt string) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.Prompt = prompt
-	}
-}
-
-func WithSTTTemperature(temp float64) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.Temperature = temp
-	}
-}
-
-func WithSTTMode(mode TranscribeMode) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.Mode = mode
-	}
-}
-
-func WithSTTInputFormat(format AudioInputFormat) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.InputFormat = format
-	}
-}
-
-func WithSTTSampleRate(rate int) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.SampleRate = rate
-	}
-}
-
-func WithSTTWordTimestamps(enabled bool) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.WordTimestamps = enabled
-	}
-}
-
-func WithSTTSpeakerLabels(enabled bool) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.SpeakerLabels = enabled
-	}
-}
-
-func WithSTTMaxAlternatives(n int) TranscribeOption {
-	return func(o *TranscribeOptions) {
-		o.MaxAlternatives = n
-	}
-}
-
-type WordInfo struct {
-	Word       string
-	StartTime  time.Duration
-	EndTime    time.Duration
-	Confidence float64
-}
-
-type SegmentInfo struct {
-	ID         int
-	Text       string
-	StartTime  time.Duration
-	EndTime    time.Duration
-	Confidence float64
-	Speaker    string
-	Words      []WordInfo
-}
-
-type TranscriptResult struct {
-	Text         string
-	Language     string
-	Duration     time.Duration
-	Confidence   float64
-	Segments     []SegmentInfo
-	Words        []WordInfo
-	Alternatives []string
-}
-
-type STTErrorCode string
-
-const (
-	ErrProviderNotSupported STTErrorCode = "provider_not_supported"
-	ErrAudioFormatInvalid   STTErrorCode = "audio_format_invalid"
-	ErrTranscriptionFailed  STTErrorCode = "transcription_failed"
-	ErrAudioTooLong         STTErrorCode = "audio_too_long"
-	ErrAudioTooLarge        STTErrorCode = "audio_too_large"
-	ErrRateLimited          STTErrorCode = "rate_limited"
-	ErrAuthentication       STTErrorCode = "authentication_failed"
-)
-
-type STTError struct {
-	Code    STTErrorCode
-	Message string
-	Err     error
-}
-
-func NewSTTError(code STTErrorCode, message string) *STTError {
-	return &STTError{Code: code, Message: message}
-}
-
-func NewSTTErrorf(code STTErrorCode, format string, args ...interface{}) *STTError {
-	return &STTError{Code: code, Message: fmt.Sprintf(format, args...)}
-}
-
-func (e *STTError) Error() string {
-	if e.Err != nil {
-		return string(e.Code) + ": " + e.Message + ": " + e.Err.Error()
-	}
-	return string(e.Code) + ": " + e.Message
-}
-
-func (e *STTError) Unwrap() error {
-	return e.Err
-}
diff --git a/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md b/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md
deleted file mode 100644
index 19636a76..00000000
--- a/_speech_vad_stt_remaining_unuploaded_20260428_171936/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# Speech VAD/STT Remaining Unuploaded Export
-
-This folder collects the speech-related changes exported from branch:
-
-- `feature/speech-vad-stt-phase1`
-
-It follows the same idea as the existing `_gateway_remaining_unuploaded_*` folders:
-
-- group changed files by topic
-- preserve relative project paths under each group
-- make it easy to review or move patches in batches
-
-## Groups
-
-### 01_vad_provider_and_voicewake
-
-Contains the VAD-side refactor and default provider switch:
-
-- provider abstraction for VAD
-- fallback heuristic VAD retained
-- WebRTC VAD implementation added
-- `VoiceWake` switched to use the provider interface and default WebRTC path
-
-### 02_stt_openai_client_refactor
-
-Contains the OpenAI STT cleanup:
-
-- multipart/request logic extracted into a thin client
-- `WhisperProvider` simplified to reuse the shared client
-
-### 03_stt_google_official_client
-
-Contains the Google STT migration:
-
-- provider moved from handwritten REST calls to the official Google Speech Go client
-- thin Google client wrapper added
-- provider config extended with `CredentialsJSON`
-- tests updated to use fake client injection
-- `go.mod` / `go.sum` included because the dependency graph changed
-
-## Validation
-
-The branch state corresponding to these files passed:
-
-- `go test ./pkg/speech/...`
-- `go test ./pkg/gateway/...`

From 63a63ae01028a2f73a5eb2993c2912041a5e8ecd Mon Sep 17 00:00:00 2001
From: TheShigure7 <2947458856@qq.com>
Date: Wed, 29 Apr 2026 01:17:26 +0800
Subject: [PATCH 6/6] fix(speech): address review feedback for VAD and STT

---
 pkg/speech/stt_openai_client.go |  4 +++-
 pkg/speech/vad_webrtc.go        | 15 +++++++++++----
 pkg/speech/voicewake.go         |  1 +
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/pkg/speech/stt_openai_client.go b/pkg/speech/stt_openai_client.go
index 3d986daa..a03cef4c 100644
--- a/pkg/speech/stt_openai_client.go
+++ b/pkg/speech/stt_openai_client.go
@@ -90,7 +90,9 @@ func (c *openAIAudioAPIClient) buildMultipartBody(audio []byte, options Transcri
 		}
 	}
 
-	if options.WordTimestamps || options.SpeakerLabels {
+	// Streaming requests use response_format=json and should not send
+	// verbose-only timestamp granularities.
+	if !stream && (options.WordTimestamps || options.SpeakerLabels) {
 		if options.WordTimestamps {
 			if err := writer.WriteField("timestamp_granularities[]", "word"); err != nil {
 				return nil, "", NewSTTErrorf(ErrTranscriptionFailed, "openai-whisper: failed to write word timestamp_granularities: %v", err)
diff --git a/pkg/speech/vad_webrtc.go b/pkg/speech/vad_webrtc.go
index 4a7d2163..9c5b41f2 100644
--- a/pkg/speech/vad_webrtc.go
+++ b/pkg/speech/vad_webrtc.go
@@ -13,6 +13,7 @@ type WebRTCVAD struct {
 	mode       int
 	sampleRate int
 	frameSize  int
+	scratch    []byte
 }
 
 func NewWebRTCVAD(cfg VADConfig) (*WebRTCVAD, error) {
@@ -41,6 +42,7 @@ func NewWebRTCVAD(cfg VADConfig) (*WebRTCVAD, error) {
 		mode:       cfg.Aggressiveness,
 		sampleRate: cfg.SampleRate,
 		frameSize:  cfg.FrameSize,
+		scratch:    make([]byte, cfg.FrameSize*2),
 	}, nil
 }
 
@@ -57,13 +59,14 @@ func (v *WebRTCVAD) ProcessFrame(samples []int16) VADState {
 		return v.inner.ProcessFrame(samples)
 	}
 
-	audio := int16ToLittleEndianBytes(samples)
+	v.inner.mu.Lock()
+	audio := v.frameBytes(samples)
 	isSpeech, err := v.detector.IsSpeech(audio, v.sampleRate)
 	if err != nil {
+		v.inner.mu.Unlock()
 		return v.inner.ProcessFrame(samples)
 	}
 
-	v.inner.mu.Lock()
 	defer v.inner.mu.Unlock()
 
 	energy := v.inner.calculateRMS(samples)
@@ -134,8 +137,12 @@ func (v *WebRTCVAD) Config() VADConfig {
 	return cfg
 }
 
-func int16ToLittleEndianBytes(samples []int16) []byte {
-	out := make([]byte, len(samples)*2)
+func (v *WebRTCVAD) frameBytes(samples []int16) []byte {
+	size := len(samples) * 2
+	if cap(v.scratch) < size {
+		v.scratch = make([]byte, size)
+	}
+	out := v.scratch[:size]
 	for i, s := range samples {
 		binary.LittleEndian.PutUint16(out[i*2:], uint16(s))
 	}
diff --git a/pkg/speech/voicewake.go b/pkg/speech/voicewake.go
index 64d89ffc..61ae30a9 100644
--- a/pkg/speech/voicewake.go
+++ b/pkg/speech/voicewake.go
@@ -130,6 +130,7 @@ func NewVoiceWake(cfg VoiceWakeConfig) *VoiceWake {
 	vad, err := vadManager.New(cfg.VADConfig, cfg.VADProvider)
 	if err != nil {
 		log.Printf("voicewake: failed to create VAD provider %q, fallback to heuristic: %v", cfg.VADProvider, err)
+		cfg.VADProvider = VADProviderHeuristic
 		vad = NewVAD(cfg.VADConfig)
 	}
 	wakeDetector := NewWakeWordDetector(cfg.WakeWordConfig)