diff --git a/ai-logic/firebase-ai/CHANGELOG.md b/ai-logic/firebase-ai/CHANGELOG.md index 0b709ed88a0..f402cc6d277 100644 --- a/ai-logic/firebase-ai/CHANGELOG.md +++ b/ai-logic/firebase-ai/CHANGELOG.md @@ -1,5 +1,7 @@ # Unreleased +- [feature] Added support for `LiveRealtimeInputConfig` and `LiveActivityDetection` to configure voice activity detection in Live API. Added `sendStartActivityRealtime` and `sendStopActivityRealtime` to `LiveSession` for manual activity control. (#8080) + - [feature] Added support for `ImageConfig` and `finishMessage`. (#8020) - [feature] Added a Java-friendly wrapper for TemplateChat interactions (`TemplateChatFutures`). diff --git a/ai-logic/firebase-ai/api.txt b/ai-logic/firebase-ai/api.txt index 7dff0b70e60..17e1c1c66b2 100644 --- a/ai-logic/firebase-ai/api.txt +++ b/ai-logic/firebase-ai/api.txt @@ -261,6 +261,8 @@ package com.google.firebase.ai.java { method public abstract com.google.common.util.concurrent.ListenableFuture sendAudioRealtime(com.google.firebase.ai.type.InlineData audio); method public abstract com.google.common.util.concurrent.ListenableFuture sendFunctionResponse(java.util.List functionList); method @Deprecated public abstract com.google.common.util.concurrent.ListenableFuture sendMediaStream(java.util.List mediaChunks); + method public abstract com.google.common.util.concurrent.ListenableFuture sendStartActivityRealtime(); + method public abstract com.google.common.util.concurrent.ListenableFuture sendStopActivityRealtime(); method public abstract com.google.common.util.concurrent.ListenableFuture sendTextRealtime(String text); method public abstract com.google.common.util.concurrent.ListenableFuture sendVideoRealtime(com.google.firebase.ai.type.InlineData video); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(); @@ -1306,6 +1308,43 @@ package com.google.firebase.ai.type { property public final double 
longitude; } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveActivityDetection { + method public static com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); + field public static final com.google.firebase.ai.type.LiveActivityDetection.Companion Companion; + } + + public static final class LiveActivityDetection.Builder { + ctor public LiveActivityDetection.Builder(); + method public com.google.firebase.ai.type.LiveActivityDetection build(); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setDisabled(boolean disabled); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setEndSensitivity(com.google.firebase.ai.type.LiveActivityDetection.Sensitivity sensitivity); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setPrefixPaddingMS(int paddingMs); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setSilenceDurationMS(int durationMs); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setStartSensitivity(com.google.firebase.ai.type.LiveActivityDetection.Sensitivity sensitivity); + field public Boolean? disabled; + field public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? endSensitivity; + field public Integer? prefixPaddingMS; + field public Integer? silenceDurationMS; + field public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? 
startSensitivity; + } + + public static final class LiveActivityDetection.Companion { + method public com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); + } + + public static final class LiveActivityDetection.Sensitivity { + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity.Companion Companion; + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity HIGH; + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity LOW; + } + + public static final class LiveActivityDetection.Sensitivity.Companion { + } + + public final class LiveActivityDetectionKt { + method public static com.google.firebase.ai.type.LiveActivityDetection liveActivityDetection(kotlin.jvm.functions.Function1 init); + } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveAudioConversationConfig { field public static final com.google.firebase.ai.type.LiveAudioConversationConfig.Companion Companion; } @@ -1346,6 +1385,7 @@ package com.google.firebase.ai.type { method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty); + method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setRealtimeInputConfig(com.google.firebase.ai.type.LiveRealtimeInputConfig? config); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? 
speechConfig); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature); @@ -1357,6 +1397,7 @@ package com.google.firebase.ai.type { field public Integer? maxOutputTokens; field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription; field public Float? presencePenalty; + field public com.google.firebase.ai.type.LiveRealtimeInputConfig? realtimeInputConfig; field public com.google.firebase.ai.type.ResponseModality? responseModality; field public com.google.firebase.ai.type.SpeechConfig? speechConfig; field public Float? temperature; @@ -1372,6 +1413,48 @@ package com.google.firebase.ai.type { method public static com.google.firebase.ai.type.LiveGenerationConfig liveGenerationConfig(kotlin.jvm.functions.Function1 init); } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveRealtimeInputConfig { + method public static com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.Companion Companion; + } + + public static final class LiveRealtimeInputConfig.ActivityHandling { + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling.Companion Companion; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling INTERRUPT; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling NO_INTERRUPT; + } + + public static final class LiveRealtimeInputConfig.ActivityHandling.Companion { + } + + public static final class LiveRealtimeInputConfig.Builder { + ctor public LiveRealtimeInputConfig.Builder(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig build(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setActivityHandling(com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling handling); + method public 
com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setAutomaticActivityDetection(com.google.firebase.ai.type.LiveActivityDetection config); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setTurnCoverage(com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage coverage); + field public com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? activityHandling; + field public com.google.firebase.ai.type.LiveActivityDetection? automaticActivityDetection; + field public com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? turnCoverage; + } + + public static final class LiveRealtimeInputConfig.Companion { + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); + } + + public static final class LiveRealtimeInputConfig.TurnCoverage { + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ALL_INPUT; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage.Companion Companion; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ONLY_ACTIVITY; + } + + public static final class LiveRealtimeInputConfig.TurnCoverage.Companion { + } + + public final class LiveRealtimeInputConfigKt { + method public static com.google.firebase.ai.type.LiveRealtimeInputConfig liveRealtimeInputConfig(kotlin.jvm.functions.Function1 init); + } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage { ctor @Deprecated public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription); method public com.google.firebase.ai.type.Content? 
getContent(); @@ -1430,6 +1513,8 @@ package com.google.firebase.ai.type { method public suspend Object? sendAudioRealtime(com.google.firebase.ai.type.InlineData audio, kotlin.coroutines.Continuation); method public suspend Object? sendFunctionResponse(java.util.List functionList, kotlin.coroutines.Continuation); method @Deprecated public suspend Object? sendMediaStream(java.util.List mediaChunks, kotlin.coroutines.Continuation); + method public suspend Object? sendStartActivityRealtime(kotlin.coroutines.Continuation); + method public suspend Object? sendStopActivityRealtime(kotlin.coroutines.Continuation); method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation); method public suspend Object? sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig, kotlin.coroutines.Continuation); diff --git a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt index 7f918232dc1..b44d0e02a31 100644 --- a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt +++ b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt @@ -25,7 +25,9 @@ import com.google.firebase.ai.type.Content import com.google.firebase.ai.type.FunctionResponsePart import com.google.firebase.ai.type.GenerativeBackend import com.google.firebase.ai.type.InlineData +import com.google.firebase.ai.type.LiveActivityDetection import com.google.firebase.ai.type.LiveGenerationConfig +import com.google.firebase.ai.type.LiveRealtimeInputConfig import com.google.firebase.ai.type.LiveServerContent import com.google.firebase.ai.type.LiveServerToolCall import 
com.google.firebase.ai.type.LiveSession @@ -36,7 +38,9 @@ import com.google.firebase.ai.type.Schema import com.google.firebase.ai.type.SessionResumptionConfig import com.google.firebase.ai.type.Tool import com.google.firebase.ai.type.content +import com.google.firebase.ai.type.liveActivityDetection import com.google.firebase.ai.type.liveGenerationConfig +import com.google.firebase.ai.type.liveRealtimeInputConfig import io.kotest.matchers.ints.shouldBeGreaterThan import io.kotest.matchers.longs.shouldBeGreaterThan import io.kotest.matchers.nulls.shouldNotBeNull @@ -318,4 +322,65 @@ class LiveSessionTests { .collect {} return transcriptBuilder.toString() } + + @Test + fun testRealtimeInputConfig_manualActivity(): Unit = runBlocking { + val config = liveGenerationConfig { + responseModality = ResponseModality.AUDIO + outputAudioTranscription = AudioTranscriptionConfig() + realtimeInputConfig = liveRealtimeInputConfig { + automaticActivityDetection = liveActivityDetection { disabled = true } + } + } + val liveModel = + getLiveModel( + modelName = modelName, + config = config, + systemInstruction = SystemInstructions.yesOrNo + ) + val session = liveModel.connect() + try { + session.sendStartActivityRealtime() + session.sendTextRealtime("Does five plus five equal ten?") + session.sendStopActivityRealtime() + + val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } ?: "" + text.toLowerCasePreservingASCIIRules() shouldContain "yes" + } finally { + session.close() + } + } + + @Test + fun testRealtimeInputConfig_fullConfiguration(): Unit = runBlocking { + val config = liveGenerationConfig { + responseModality = ResponseModality.AUDIO + outputAudioTranscription = AudioTranscriptionConfig() + realtimeInputConfig = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + turnCoverage = LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY + automaticActivityDetection = liveActivityDetection { + 
startSensitivity = LiveActivityDetection.Sensitivity.HIGH + endSensitivity = LiveActivityDetection.Sensitivity.LOW + prefixPaddingMS = 100 + silenceDurationMS = 500 + disabled = false + } + } + } + val liveModel = + getLiveModel( + modelName = modelName, + config = config, + systemInstruction = SystemInstructions.yesOrNo + ) + val session = liveModel.connect() + try { + session.sendTextRealtime("Is the sky blue?") + val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } ?: "" + text.toLowerCasePreservingASCIIRules() shouldContain "yes" + } finally { + session.close() + } + } } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt index 4b248d01f40..c59ddb079ea 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt @@ -118,7 +118,8 @@ internal constructor( config?.inputAudioTranscription?.toInternal(), config?.outputAudioTranscription?.toInternal(), resumption?.toInternal(), - config?.contextWindowCompression?.toInternal() + config?.contextWindowCompression?.toInternal(), + config?.realtimeInputConfig?.toInternal() ) .toInternal() val data: String = JSON.encodeToString(clientMessage) diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt index 604f0180f61..eb85bed2f9f 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt @@ -220,6 +220,18 @@ public abstract class LiveSessionFutures internal constructor() { */ public abstract fun sendTextRealtime(text: String): ListenableFuture + /** + * Manually marks the start of 
user activity. Required only when automatic activity detection is + * disabled. + */ + public abstract fun sendStartActivityRealtime(): ListenableFuture + + /** + * Manually marks the end of user activity. Required only when automatic activity detection is + * disabled. + */ + public abstract fun sendStopActivityRealtime(): ListenableFuture + /** * Streams client data to the model. * @@ -294,6 +306,12 @@ public abstract class LiveSessionFutures internal constructor() { override fun sendTextRealtime(text: String): ListenableFuture = SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) } + override fun sendStartActivityRealtime(): ListenableFuture = + SuspendToFutureAdapter.launchFuture { session.sendStartActivityRealtime() } + + override fun sendStopActivityRealtime(): ListenableFuture = + SuspendToFutureAdapter.launchFuture { session.sendStopActivityRealtime() } + override fun sendMediaStream(mediaChunks: List) = SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt new file mode 100644 index 00000000000..c4290d103df --- /dev/null +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt @@ -0,0 +1,134 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.firebase.ai.type + +import kotlinx.serialization.SerialName +import kotlinx.serialization.Serializable + +/** + * Configures how the model automatically detects user activity in the Live API. + * + * @property startSensitivity How readily the start of speech is detected; see [Sensitivity]. + * @property endSensitivity How readily the end of speech is detected; see [Sensitivity]. + * @property prefixPaddingMS How long detected speech should be present before start-of-speech is + * committed. The lower this value, the more sensitive the start-of-speech detection is and the + * shorter the speech that can be recognized. However, this also increases the probability of false + * positives. + * @property silenceDurationMS How long silence (or non-speech) should be present before + * end-of-speech is committed. The larger this value, the longer speech gaps can be without + * interrupting the user's activity, but this will increase the model's latency. + * @property disabled Set to `true` to turn automatic activity detection off. While it is enabled, + * the model interprets detected voices and text as the start of activity. While it is disabled, + * the user must send activity signals manually. + */ +@PublicPreviewAPI +public class LiveActivityDetection +private constructor( + internal val startSensitivity: Sensitivity?, + internal val endSensitivity: Sensitivity?, + internal val prefixPaddingMS: Int?, + internal val silenceDurationMS: Int?, + internal val disabled: Boolean? +) { + + /** How sensitively the model interprets speech activity. */ + public class Sensitivity private constructor(internal val value: String) { + public companion object { + /** + * The model will detect speech less often. In other words, a higher volume of speech is + * required for the model to consider that the user is speaking. + */ + @JvmField public val LOW: Sensitivity = Sensitivity("LOW") + + /** + * The model will detect speech more often. 
In other words, a lower volume of speech is + * required for the model to consider the user is speaking. + */ + @JvmField public val HIGH: Sensitivity = Sensitivity("HIGH") + } + } + + /** Builder for a [LiveActivityDetection]; every field is optional and defaults to `null`. */ + public class Builder { + @JvmField public var startSensitivity: Sensitivity? = null + @JvmField public var endSensitivity: Sensitivity? = null + @JvmField public var prefixPaddingMS: Int? = null + @JvmField public var silenceDurationMS: Int? = null + @JvmField public var disabled: Boolean? = null + + public fun setStartSensitivity(sensitivity: Sensitivity): Builder = apply { + this.startSensitivity = sensitivity + } + + public fun setEndSensitivity(sensitivity: Sensitivity): Builder = apply { + this.endSensitivity = sensitivity + } + + public fun setPrefixPaddingMS(paddingMs: Int): Builder = apply { + this.prefixPaddingMS = paddingMs + } + + public fun setSilenceDurationMS(durationMs: Int): Builder = apply { + this.silenceDurationMS = durationMs + } + + public fun setDisabled(disabled: Boolean): Builder = apply { this.disabled = disabled } + + /** Creates a [LiveActivityDetection] from the values currently set on this builder. */ + public fun build(): LiveActivityDetection = + LiveActivityDetection( + startSensitivity, + endSensitivity, + prefixPaddingMS, + silenceDurationMS, + disabled + ) + } + + internal fun toInternal(): Internal = + Internal( + startSensitivity = startSensitivity?.let { "START_SENSITIVITY_${it.value}" }, + endSensitivity = endSensitivity?.let { "END_SENSITIVITY_${it.value}" }, + prefixPaddingMs = prefixPaddingMS, + silenceDurationMs = silenceDurationMS, + disabled = disabled + ) + + @Serializable + internal data class Internal( + @SerialName("start_of_speech_sensitivity") val startSensitivity: String? = null, + @SerialName("end_of_speech_sensitivity") val endSensitivity: String? = null, + @SerialName("prefix_padding_ms") val prefixPaddingMs: Int? = null, + @SerialName("silence_duration_ms") val silenceDurationMs: Int? 
= null, + @SerialName("disabled") val disabled: Boolean? = null + ) + + public companion object { + /** Creates a new [Builder]. */ + @JvmStatic public fun builder(): Builder = Builder() + } +} + +/** Helper method to construct a [LiveActivityDetection] in a DSL-like manner. */ +@OptIn(PublicPreviewAPI::class) +public fun liveActivityDetection( + init: LiveActivityDetection.Builder.() -> Unit +): LiveActivityDetection { + val builder = LiveActivityDetection.builder() + builder.init() + return builder.build() +} diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt index bbe1633b97e..d296917e8de 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt @@ -38,6 +38,7 @@ internal class LiveClientSetupMessage( val outputAudioTranscription: AudioTranscriptionConfig.Internal?, val sessionResumption: SessionResumptionConfig.Internal?, val contextWindowCompression: ContextWindowCompressionConfig.Internal?, + val realtimeInputConfig: LiveRealtimeInputConfig.Internal?, ) { @Serializable internal class Internal(val setup: LiveClientSetup) { @@ -53,6 +54,8 @@ internal class LiveClientSetupMessage( val sessionResumption: SessionResumptionConfig.Internal? = null, @SerialName("context_window_compression") val contextWindowCompression: ContextWindowCompressionConfig.Internal? = null, + @SerialName("realtime_input_config") + val realtimeInputConfig: LiveRealtimeInputConfig.Internal? 
= null, ) } @@ -66,7 +69,8 @@ internal class LiveClientSetupMessage( inputAudioTranscription, outputAudioTranscription, sessionResumption, - contextWindowCompression + contextWindowCompression, + realtimeInputConfig ) ) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt index 764100fe9b2..5de4e9c2c8d 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt @@ -58,6 +58,8 @@ import kotlinx.serialization.Serializable * @property outputAudioTranscription Specifies the configuration for transcribing output audio from * the model. * + * @property realtimeInputConfig Configures realtime input for the session + * * Refer to the * [Control generated output](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output) * guide for more details. @@ -76,6 +78,7 @@ private constructor( internal val inputAudioTranscription: AudioTranscriptionConfig?, internal val outputAudioTranscription: AudioTranscriptionConfig?, internal val contextWindowCompression: ContextWindowCompressionConfig?, + internal val realtimeInputConfig: LiveRealtimeInputConfig?, ) { /** @@ -105,6 +108,8 @@ private constructor( * @property outputAudioTranscription see [LiveGenerationConfig.outputAudioTranscription] * * @property contextWindowCompression see [LiveGenerationConfig.contextWindowCompression] + * + * @property realtimeInputConfig see [LiveGenerationConfig.realtimeInputConfig] */ public class Builder { @JvmField public var temperature: Float? = null @@ -118,6 +123,7 @@ private constructor( @JvmField public var inputAudioTranscription: AudioTranscriptionConfig? = null @JvmField public var outputAudioTranscription: AudioTranscriptionConfig? 
= null @JvmField public var contextWindowCompression: ContextWindowCompressionConfig? = null + @JvmField public var realtimeInputConfig: LiveRealtimeInputConfig? = null public fun setTemperature(temperature: Float?): Builder = apply { this.temperature = temperature @@ -153,6 +159,10 @@ private constructor( this.contextWindowCompression = config } + public fun setRealtimeInputConfig(config: LiveRealtimeInputConfig?): Builder = apply { + this.realtimeInputConfig = config + } + /** Create a new [LiveGenerationConfig] with the attached arguments. */ public fun build(): LiveGenerationConfig = LiveGenerationConfig( @@ -167,6 +177,7 @@ private constructor( inputAudioTranscription = inputAudioTranscription, outputAudioTranscription = outputAudioTranscription, contextWindowCompression = contextWindowCompression, + realtimeInputConfig = realtimeInputConfig, ) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt new file mode 100644 index 00000000000..4cc2e7a23c3 --- /dev/null +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt @@ -0,0 +1,127 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.firebase.ai.type + +import kotlinx.serialization.SerialName +import kotlinx.serialization.Serializable + +/** + * Configures model input behavior when generating content in the Live API via the realtime + * methods of [LiveSession]. + * + * @property automaticActivityDetection Configures automatic activity detection on the model. When + * not set, automatic activity detection is enabled by default. If set with `disabled = true`, the + * user must send activity signals manually. + * @property activityHandling Defines how the model treats user input activity. + * @property turnCoverage Defines which input is included in the user's turn, relative to the + * starting and ending of the activity. + */ +@PublicPreviewAPI +public class LiveRealtimeInputConfig +private constructor( + internal val automaticActivityDetection: LiveActivityDetection?, + internal val activityHandling: ActivityHandling?, + internal val turnCoverage: TurnCoverage? +) { + + /** How a model handles user input activity. */ + public class ActivityHandling private constructor(internal val value: String) { + public companion object { + /** + * When the user sends input marking the start of activity, the model's current response will + * be cut off immediately. + */ + @JvmField + public val INTERRUPT: ActivityHandling = ActivityHandling("START_OF_ACTIVITY_INTERRUPTS") + + /** + * When the user sends input marking the start of activity, the model will process it, but + * won't cut off its current response. + */ + @JvmField public val NO_INTERRUPT: ActivityHandling = ActivityHandling("NO_INTERRUPTION") + } + } + + /** How the model considers which input is included in the user's turn. */ + public class TurnCoverage private constructor(internal val value: String) { + public companion object { + /** + * The model will exclude inactivity (e.g, silence on the audio stream) from the user's input. 
+ */ + @JvmField public val ONLY_ACTIVITY: TurnCoverage = TurnCoverage("TURN_INCLUDES_ONLY_ACTIVITY") + + /** + * The model will treat all input received since the last turn (including inactivity) as the + * user's input. + */ + @JvmField public val ALL_INPUT: TurnCoverage = TurnCoverage("TURN_INCLUDES_ALL_INPUT") + } + } + + /** Builder for a [LiveRealtimeInputConfig]; every field is optional and defaults to `null`. */ + public class Builder { + @JvmField public var automaticActivityDetection: LiveActivityDetection? = null + @JvmField public var activityHandling: ActivityHandling? = null + @JvmField public var turnCoverage: TurnCoverage? = null + + public fun setAutomaticActivityDetection(config: LiveActivityDetection): Builder = apply { + this.automaticActivityDetection = config + } + + public fun setActivityHandling(handling: ActivityHandling): Builder = apply { + this.activityHandling = handling + } + + public fun setTurnCoverage(coverage: TurnCoverage): Builder = apply { + this.turnCoverage = coverage + } + + /** Creates a [LiveRealtimeInputConfig] from the values currently set on this builder. */ + public fun build(): LiveRealtimeInputConfig = + LiveRealtimeInputConfig(automaticActivityDetection, activityHandling, turnCoverage) + } + + internal fun toInternal(): Internal = + Internal( + automaticActivityDetection = automaticActivityDetection?.toInternal(), + activityHandling = activityHandling?.value, + turnCoverage = turnCoverage?.value + ) + + @Serializable + internal data class Internal( + @SerialName("automatic_activity_detection") + val automaticActivityDetection: LiveActivityDetection.Internal? = null, + @SerialName("activity_handling") val activityHandling: String? = null, + @SerialName("turn_coverage") val turnCoverage: String? = null + ) + + public companion object { + /** Creates a new [Builder]. */ + @JvmStatic public fun builder(): Builder = Builder() + } +} + +/** Helper method to construct a [LiveRealtimeInputConfig] in a DSL-like manner. 
*/ +@OptIn(PublicPreviewAPI::class) +public fun liveRealtimeInputConfig( + init: LiveRealtimeInputConfig.Builder.() -> Unit +): LiveRealtimeInputConfig { + val builder = LiveRealtimeInputConfig.builder() + builder.init() + return builder.build() +} diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt index 7aac4b73da0..0ef1d889f4e 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt @@ -61,6 +61,7 @@ import kotlinx.coroutines.flow.onEach import kotlinx.coroutines.isActive import kotlinx.coroutines.launch import kotlinx.serialization.ExperimentalSerializationApi +import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString @@ -430,6 +431,37 @@ internal constructor( sendFrame(BidiGenerateContentRealtimeInputSetup(text = text).toInternal()) } + /** + * Manually marks the start of user activity, using the realtime API. + * + * The start of user activity is effectively the start of a user's turn, but depending on the + * [LiveRealtimeInputConfig.ActivityHandling] in effect, it may not be interpreted as an + * interruption. An example of the start of user activity could be the user speaking (not + * silence). + * + * Should be followed by a call to [sendStopActivityRealtime] after all the data has been sent + * for the user's turn. + * + * Only required when automatic activity detection is disabled via [LiveRealtimeInputConfig]. + */ + public suspend fun sendStartActivityRealtime() { + sendFrame(BidiGenerateContentRealtimeInputSetup(activityStart = true).toInternal()) + } + + /** + * Manually marks the end of user activity, using the realtime API. 
+ * + * The end of user activity is effectively the end of a user's turn, and signals that the model + * can start sending responses. + * + * Should follow after a previous call to [sendStartActivityRealtime]. + * + * Only required when automatic activity detection is disabled via [LiveRealtimeInputConfig]. + */ + public suspend fun sendStopActivityRealtime() { + sendFrame(BidiGenerateContentRealtimeInputSetup(activityEnd = true).toInternal()) + } + /** * Streams client data to the model. * @@ -706,8 +738,13 @@ internal constructor( val mediaChunks: List? = null, val audio: InlineData? = null, val video: InlineData? = null, - val text: String? = null + val text: String? = null, + val activityStart: Boolean = false, + val activityEnd: Boolean = false ) { + @Serializable internal class ActivityStart + @Serializable internal class ActivityEnd + @Serializable internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) { @Serializable @@ -715,7 +752,9 @@ internal constructor( val mediaChunks: List?, val audio: InlineData.Internal?, val video: InlineData.Internal?, - val text: String? + val text: String?, + @SerialName("activity_start") val activityStart: ActivityStart? = null, + @SerialName("activity_end") val activityEnd: ActivityEnd? 
= null ) } fun toInternal() = @@ -724,7 +763,9 @@ internal constructor( mediaChunks?.map { it.toInternal() }, audio?.toInternal(), video?.toInternal(), - text + text, + if (activityStart) ActivityStart() else null, + if (activityEnd) ActivityEnd() else null ) ) } diff --git a/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt b/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt new file mode 100644 index 00000000000..f1002b43170 --- /dev/null +++ b/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt @@ -0,0 +1,161 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.firebase.ai.type + +import com.google.firebase.ai.common.JSON +import io.kotest.assertions.json.shouldEqualJson +import io.kotest.matchers.equals.shouldBeEqual +import kotlinx.serialization.encodeToString +import org.junit.Test + +@OptIn(PublicPreviewAPI::class) +internal class LiveRealtimeInputConfigTest { + + @Test + fun `Basic LiveRealtimeInputConfig`() { + val config = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + turnCoverage = LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY + automaticActivityDetection = liveActivityDetection { disabled = true } + } + + val expectedJson = + """ + { + "activity_handling": "NO_INTERRUPTION", + "turn_coverage": "TURN_INCLUDES_ONLY_ACTIVITY", + "automatic_activity_detection": { + "disabled": true + } + } + """ + .trimIndent() + + JSON.encodeToString(config.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `LiveActivityDetection full config`() { + val detection = liveActivityDetection { + startSensitivity = LiveActivityDetection.Sensitivity.HIGH + endSensitivity = LiveActivityDetection.Sensitivity.LOW + prefixPaddingMS = 100 + silenceDurationMS = 500 + disabled = false + } + + val expectedJson = + """ + { + "start_of_speech_sensitivity": "START_SENSITIVITY_HIGH", + "end_of_speech_sensitivity": "END_SENSITIVITY_LOW", + "prefix_padding_ms": 100, + "silence_duration_ms": 500, + "disabled": false + } + """ + .trimIndent() + + JSON.encodeToString(detection.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `DSL correctly delegates to Builder`() { + val config = + LiveRealtimeInputConfig.builder() + .setActivityHandling(LiveRealtimeInputConfig.ActivityHandling.INTERRUPT) + .build() + + val configDsl = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.INTERRUPT + } + + config.activityHandling?.shouldBeEqual( + configDsl.activityHandling as LiveRealtimeInputConfig.ActivityHandling + ) + } 
+ + @Test + fun `LiveClientSetupMessage includes LiveRealtimeInputConfig in serialization`() { + val config = liveGenerationConfig { + realtimeInputConfig = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + } + } + val message = + LiveClientSetupMessage( + model = "my-model", + generationConfig = null, + tools = null, + systemInstruction = null, + inputAudioTranscription = null, + outputAudioTranscription = null, + sessionResumption = null, + contextWindowCompression = null, + realtimeInputConfig = config.realtimeInputConfig?.toInternal() + ) + + val expectedJson = + """ + { + "setup": { + "model": "my-model", + "realtime_input_config": { + "activity_handling": "NO_INTERRUPTION" + } + } + } + """ + .trimIndent() + + JSON.encodeToString(message.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `BidiGenerateContentRealtimeInputSetup with activityStart serializes correctly`() { + val setup = LiveSession.BidiGenerateContentRealtimeInputSetup(activityStart = true) + + val expectedJson = + """ + { + "realtimeInput": { + "activity_start": {} + } + } + """ + .trimIndent() + + JSON.encodeToString(setup.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `BidiGenerateContentRealtimeInputSetup with activityEnd serializes correctly`() { + val setup = LiveSession.BidiGenerateContentRealtimeInputSetup(activityEnd = true) + + val expectedJson = + """ + { + "realtimeInput": { + "activity_end": {} + } + } + """ + .trimIndent() + + JSON.encodeToString(setup.toInternal()).shouldEqualJson(expectedJson) + } +} diff --git a/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java b/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java index e5372155fd0..aeeb317bce2 100644 --- a/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java +++ b/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java @@ 
-63,7 +63,9 @@ import com.google.firebase.ai.type.InlineData; import com.google.firebase.ai.type.InlineDataPart; import com.google.firebase.ai.type.LatLng; +import com.google.firebase.ai.type.LiveActivityDetection; import com.google.firebase.ai.type.LiveGenerationConfig; +import com.google.firebase.ai.type.LiveRealtimeInputConfig; import com.google.firebase.ai.type.LiveServerContent; import com.google.firebase.ai.type.LiveServerMessage; import com.google.firebase.ai.type.LiveServerSetupComplete; @@ -183,6 +185,19 @@ private LiveGenerationConfig getLiveConfig() { .setPresencePenalty(2.0F) .setResponseModality(ResponseModality.AUDIO) .setSpeechConfig(new SpeechConfig(new Voice("AOEDE"))) + .setRealtimeInputConfig( + new LiveRealtimeInputConfig.Builder() + .setActivityHandling(LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT) + .setTurnCoverage(LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY) + .setAutomaticActivityDetection( + new LiveActivityDetection.Builder() + .setDisabled(true) + .setStartSensitivity(LiveActivityDetection.Sensitivity.HIGH) + .setEndSensitivity(LiveActivityDetection.Sensitivity.LOW) + .setPrefixPaddingMS(100) + .setSilenceDurationMS(500) + .build()) + .build()) .build(); } @@ -448,6 +463,8 @@ public void onComplete() { session.sendAudioRealtime(new InlineData(bytes, "audio/jxl", null)); session.sendVideoRealtime(new InlineData(bytes, "image/jxl", null)); session.sendTextRealtime("text"); + session.sendStartActivityRealtime(); + session.sendStopActivityRealtime(); FunctionResponsePart functionResponse = new FunctionResponsePart("myFunction", new JsonObject(Map.of()));