From 826d43b02594efc8ee90860dea494b782d05feaf Mon Sep 17 00:00:00 2001 From: Mila <107142260+milaGGL@users.noreply.github.com> Date: Thu, 30 Apr 2026 12:00:19 -0400 Subject: [PATCH 1/4] [AI] implement RealtimeInputConfig --- ai-logic/firebase-ai/api.txt | 89 ++++++++++ .../google/firebase/ai/LiveSessionTests.kt | 62 ++++++- .../google/firebase/ai/LiveGenerativeModel.kt | 3 +- .../firebase/ai/java/LiveSessionFutures.kt | 18 ++ .../firebase/ai/type/LiveActivityDetection.kt | 155 +++++++++++++++++ .../ai/type/LiveClientSetupMessage.kt | 6 +- .../firebase/ai/type/LiveGenerationConfig.kt | 11 ++ .../ai/type/LiveRealtimeInputConfig.kt | 153 +++++++++++++++++ .../google/firebase/ai/type/LiveSession.kt | 47 ++++- .../ai/type/LiveRealtimeInputConfigTest.kt | 161 ++++++++++++++++++ .../google/firebase/ai/JavaCompileTests.java | 17 ++ 11 files changed, 714 insertions(+), 8 deletions(-) create mode 100644 ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt create mode 100644 ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt create mode 100644 ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt diff --git a/ai-logic/firebase-ai/api.txt b/ai-logic/firebase-ai/api.txt index 18a650770bd..a3d878f9805 100644 --- a/ai-logic/firebase-ai/api.txt +++ b/ai-logic/firebase-ai/api.txt @@ -248,6 +248,8 @@ package com.google.firebase.ai.java { method public abstract com.google.common.util.concurrent.ListenableFuture sendAudioRealtime(com.google.firebase.ai.type.InlineData audio); method public abstract com.google.common.util.concurrent.ListenableFuture sendFunctionResponse(java.util.List functionList); method @Deprecated public abstract com.google.common.util.concurrent.ListenableFuture sendMediaStream(java.util.List mediaChunks); + method public abstract com.google.common.util.concurrent.ListenableFuture sendStartActivityRealtime(); + method public abstract 
com.google.common.util.concurrent.ListenableFuture sendStopActivityRealtime(); method public abstract com.google.common.util.concurrent.ListenableFuture sendTextRealtime(String text); method public abstract com.google.common.util.concurrent.ListenableFuture sendVideoRealtime(com.google.firebase.ai.type.InlineData video); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public abstract com.google.common.util.concurrent.ListenableFuture startAudioConversation(); @@ -1293,6 +1295,49 @@ package com.google.firebase.ai.type { property public final double longitude; } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveActivityDetection { + method public static com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); + method public Boolean? getDisabled(); + method public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? getEndSensitivity(); + method public Integer? getPrefixPaddingMS(); + method public Integer? getSilenceDurationMS(); + method public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? getStartSensitivity(); + property public final Boolean? disabled; + property public final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? endSensitivity; + property public final Integer? prefixPaddingMS; + property public final Integer? silenceDurationMS; + property public final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? 
startSensitivity; + field public static final com.google.firebase.ai.type.LiveActivityDetection.Companion Companion; + } + + public static final class LiveActivityDetection.Builder { + ctor public LiveActivityDetection.Builder(); + method public com.google.firebase.ai.type.LiveActivityDetection build(); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setDisabled(boolean disabled); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setEndSensitivity(com.google.firebase.ai.type.LiveActivityDetection.Sensitivity sensitivity); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setPrefixPaddingMS(int paddingMs); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setSilenceDurationMS(int durationMs); + method public com.google.firebase.ai.type.LiveActivityDetection.Builder setStartSensitivity(com.google.firebase.ai.type.LiveActivityDetection.Sensitivity sensitivity); + field public Boolean? disabled; + field public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? endSensitivity; + field public Integer? prefixPaddingMS; + field public Integer? silenceDurationMS; + field public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? 
startSensitivity; + } + + public static final class LiveActivityDetection.Companion { + method public com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); + } + + public enum LiveActivityDetection.Sensitivity { + enum_constant public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity HIGH; + enum_constant public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity LOW; + } + + public final class LiveActivityDetectionKt { + method public static com.google.firebase.ai.type.LiveActivityDetection liveActivityDetection(kotlin.jvm.functions.Function1 init); + } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveAudioConversationConfig { field public static final com.google.firebase.ai.type.LiveAudioConversationConfig.Companion Companion; } @@ -1333,6 +1378,7 @@ package com.google.firebase.ai.type { method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setMaxOutputTokens(Integer? maxOutputTokens); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setOutputAudioTranscription(com.google.firebase.ai.type.AudioTranscriptionConfig? config); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setPresencePenalty(Float? presencePenalty); + method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setRealtimeInputConfig(com.google.firebase.ai.type.LiveRealtimeInputConfig? config); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setResponseModality(com.google.firebase.ai.type.ResponseModality? responseModality); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setSpeechConfig(com.google.firebase.ai.type.SpeechConfig? speechConfig); method public com.google.firebase.ai.type.LiveGenerationConfig.Builder setTemperature(Float? temperature); @@ -1344,6 +1390,7 @@ package com.google.firebase.ai.type { field public Integer? 
maxOutputTokens; field public com.google.firebase.ai.type.AudioTranscriptionConfig? outputAudioTranscription; field public Float? presencePenalty; + field public com.google.firebase.ai.type.LiveRealtimeInputConfig? realtimeInputConfig; field public com.google.firebase.ai.type.ResponseModality? responseModality; field public com.google.firebase.ai.type.SpeechConfig? speechConfig; field public Float? temperature; @@ -1359,6 +1406,46 @@ package com.google.firebase.ai.type { method public static com.google.firebase.ai.type.LiveGenerationConfig liveGenerationConfig(kotlin.jvm.functions.Function1 init); } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveRealtimeInputConfig { + method public static com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? getActivityHandling(); + method public com.google.firebase.ai.type.LiveActivityDetection? getAutomaticActivityDetection(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? getTurnCoverage(); + property public final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? activityHandling; + property public final com.google.firebase.ai.type.LiveActivityDetection? automaticActivityDetection; + property public final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? 
turnCoverage; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.Companion Companion; + } + + public enum LiveRealtimeInputConfig.ActivityHandling { + enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling INTERRUPT; + enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling NO_INTERRUPT; + } + + public static final class LiveRealtimeInputConfig.Builder { + ctor public LiveRealtimeInputConfig.Builder(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig build(); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setActivityHandling(com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling handling); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setAutomaticActivityDetection(com.google.firebase.ai.type.LiveActivityDetection config); + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder setTurnCoverage(com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage coverage); + field public com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? activityHandling; + field public com.google.firebase.ai.type.LiveActivityDetection? automaticActivityDetection; + field public com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? 
turnCoverage; + } + + public static final class LiveRealtimeInputConfig.Companion { + method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); + } + + public enum LiveRealtimeInputConfig.TurnCoverage { + enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ALL_INPUT; + enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ONLY_ACTIVITY; + } + + public final class LiveRealtimeInputConfigKt { + method public static com.google.firebase.ai.type.LiveRealtimeInputConfig liveRealtimeInputConfig(kotlin.jvm.functions.Function1 init); + } + @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveServerContent implements com.google.firebase.ai.type.LiveServerMessage { ctor @Deprecated public LiveServerContent(com.google.firebase.ai.type.Content? content, boolean interrupted, boolean turnComplete, boolean generationComplete, com.google.firebase.ai.type.Transcription? inputTranscription, com.google.firebase.ai.type.Transcription? outputTranscription); method public com.google.firebase.ai.type.Content? getContent(); @@ -1417,6 +1504,8 @@ package com.google.firebase.ai.type { method public suspend Object? sendAudioRealtime(com.google.firebase.ai.type.InlineData audio, kotlin.coroutines.Continuation); method public suspend Object? sendFunctionResponse(java.util.List functionList, kotlin.coroutines.Continuation); method @Deprecated public suspend Object? sendMediaStream(java.util.List mediaChunks, kotlin.coroutines.Continuation); + method public suspend Object? sendStartActivityRealtime(kotlin.coroutines.Continuation); + method public suspend Object? sendStopActivityRealtime(kotlin.coroutines.Continuation); method public suspend Object? sendTextRealtime(String text, kotlin.coroutines.Continuation); method public suspend Object? 
sendVideoRealtime(com.google.firebase.ai.type.InlineData video, kotlin.coroutines.Continuation); method @RequiresPermission(android.Manifest.permission.RECORD_AUDIO) public suspend Object? startAudioConversation(com.google.firebase.ai.type.LiveAudioConversationConfig liveAudioConversationConfig, kotlin.coroutines.Continuation); diff --git a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt index b7a0039ff0e..e2536fdfadc 100644 --- a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt +++ b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt @@ -25,7 +25,9 @@ import com.google.firebase.ai.type.Content import com.google.firebase.ai.type.FunctionResponsePart import com.google.firebase.ai.type.GenerativeBackend import com.google.firebase.ai.type.InlineData +import com.google.firebase.ai.type.LiveActivityDetection import com.google.firebase.ai.type.LiveGenerationConfig +import com.google.firebase.ai.type.LiveRealtimeInputConfig import com.google.firebase.ai.type.LiveServerContent import com.google.firebase.ai.type.LiveServerToolCall import com.google.firebase.ai.type.LiveSession @@ -36,7 +38,9 @@ import com.google.firebase.ai.type.Schema import com.google.firebase.ai.type.SessionResumptionConfig import com.google.firebase.ai.type.Tool import com.google.firebase.ai.type.content +import com.google.firebase.ai.type.liveActivityDetection import com.google.firebase.ai.type.liveGenerationConfig +import com.google.firebase.ai.type.liveRealtimeInputConfig import io.kotest.matchers.ints.shouldBeGreaterThan import io.kotest.matchers.longs.shouldBeGreaterThan import io.kotest.matchers.nulls.shouldNotBeNull @@ -279,6 +283,7 @@ class LiveSessionTests { val session = liveModel.connect(SessionResumptionConfig()) session.send("My favorite color is blue. 
Remember that.", true) var lastResumptionUpdate: LiveSessionResumptionUpdate? = null + var gotTurnComplete = false withTimeout(30.seconds) { session .receive() @@ -287,10 +292,10 @@ class LiveSessionTests { lastResumptionUpdate = it } if (it is LiveServerContent && it.turnComplete) { - false - } else { - true + gotTurnComplete = true } + // Stop collecting when there's a new handle AND turnComplete is true + !(gotTurnComplete && lastResumptionUpdate?.newHandle != null) } .collect {} } @@ -317,4 +322,55 @@ class LiveSessionTests { .collect {} return transcriptBuilder.toString() } + + @Test + fun testRealtimeInputConfig_manualActivity(): Unit = runBlocking { + val config = liveGenerationConfig { + responseModality = ResponseModality.AUDIO + outputAudioTranscription = AudioTranscriptionConfig() + realtimeInputConfig = liveRealtimeInputConfig { + automaticActivityDetection = liveActivityDetection { disabled = true } + } + } + val liveModel = getLiveModel(modelName = modelName, config = config) + val session = liveModel.connect() + try { + session.sendStartActivityRealtime() + session.sendTextRealtime("Hello") + session.sendStopActivityRealtime() + + // Wait for some response, just ensure no crash + val text = withTimeoutOrNull(10.seconds) { session.collectNextAudioOutputTranscript() } + } finally { + session.close() + } + } + + @Test + fun testRealtimeInputConfig_fullConfiguration(): Unit = runBlocking { + val config = liveGenerationConfig { + responseModality = ResponseModality.AUDIO + outputAudioTranscription = AudioTranscriptionConfig() + realtimeInputConfig = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + turnCoverage = LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY + automaticActivityDetection = liveActivityDetection { + startSensitivity = LiveActivityDetection.Sensitivity.HIGH + endSensitivity = LiveActivityDetection.Sensitivity.LOW + prefixPaddingMS = 100 + silenceDurationMS = 500 + disabled = false + } 
+ } + } + val liveModel = getLiveModel(modelName = modelName, config = config) + val session = liveModel.connect() + try { + session.sendTextRealtime("Hello") + val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } + text.shouldNotBeNull() + } finally { + session.close() + } + } } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt index 4b248d01f40..c59ddb079ea 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/LiveGenerativeModel.kt @@ -118,7 +118,8 @@ internal constructor( config?.inputAudioTranscription?.toInternal(), config?.outputAudioTranscription?.toInternal(), resumption?.toInternal(), - config?.contextWindowCompression?.toInternal() + config?.contextWindowCompression?.toInternal(), + config?.realtimeInputConfig?.toInternal() ) .toInternal() val data: String = JSON.encodeToString(clientMessage) diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt index 604f0180f61..eb85bed2f9f 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/java/LiveSessionFutures.kt @@ -220,6 +220,18 @@ public abstract class LiveSessionFutures internal constructor() { */ public abstract fun sendTextRealtime(text: String): ListenableFuture + /** + * Manually marks the start of user activity. Required only when automatic activity detection is + * disabled. + */ + public abstract fun sendStartActivityRealtime(): ListenableFuture + + /** + * Manually marks the end of user activity. Required only when automatic activity detection is + * disabled. 
+ */ + public abstract fun sendStopActivityRealtime(): ListenableFuture + /** * Streams client data to the model. * @@ -294,6 +306,12 @@ public abstract class LiveSessionFutures internal constructor() { override fun sendTextRealtime(text: String): ListenableFuture = SuspendToFutureAdapter.launchFuture { session.sendTextRealtime(text) } + override fun sendStartActivityRealtime(): ListenableFuture = + SuspendToFutureAdapter.launchFuture { session.sendStartActivityRealtime() } + + override fun sendStopActivityRealtime(): ListenableFuture = + SuspendToFutureAdapter.launchFuture { session.sendStopActivityRealtime() } + override fun sendMediaStream(mediaChunks: List) = SuspendToFutureAdapter.launchFuture { session.sendMediaStream(mediaChunks) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt new file mode 100644 index 00000000000..fd4e1a59d2a --- /dev/null +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt @@ -0,0 +1,155 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.firebase.ai.type + +import kotlinx.serialization.SerialName +import kotlinx.serialization.Serializable + +/** + * Configures the model's automatic detection of user activity. 
+ * + * @property startSensitivity Determines how likely the start of speech is to be detected. + * @property endSensitivity Determines how likely the end of speech is to be detected. + * @property prefixPaddingMS How long detected speech should be present before start-of-speech is + * committed. The lower this value, the more sensitive the start-of-speech detection is and the + * shorter the speech that can be recognized. However, this also increases the probability of false + * positives. + * @property silenceDurationMS How long silence (or non-speech) should be present before + * end-of-speech is committed. The larger this value, the longer speech gaps can be without + * interrupting the user's activity, but this will increase the model's latency. + * @property disabled Disables automatic activity detection. When automatic activity detection is + * enabled, the model will interpret detected voices and text as the start of activity. When + * automatic activity detection is disabled, the user must send activity signals manually. + */ +@PublicPreviewAPI +public class LiveActivityDetection +private constructor( + public val startSensitivity: Sensitivity?, + public val endSensitivity: Sensitivity?, + public val prefixPaddingMS: Int?, + public val silenceDurationMS: Int?, + public val disabled: Boolean? +) { + + /** How sensitively the model interprets speech activity. */ + public enum class Sensitivity { + /** + * The model will detect speech less often. In other words, a higher volume of speech is + * required for the model to consider that the user is speaking. + */ + LOW, + /** + * The model will detect speech more often. In other words, a lower volume of speech is required + * for the model to consider that the user is speaking. + */ + HIGH + } + + /** Builder for creating a [LiveActivityDetection]. */ + public class Builder { + @JvmField public var startSensitivity: Sensitivity? = null + @JvmField public var endSensitivity: Sensitivity?
= null + @JvmField public var prefixPaddingMS: Int? = null + @JvmField public var silenceDurationMS: Int? = null + @JvmField public var disabled: Boolean? = null + + public fun setStartSensitivity(sensitivity: Sensitivity): Builder = apply { + this.startSensitivity = sensitivity + } + + public fun setEndSensitivity(sensitivity: Sensitivity): Builder = apply { + this.endSensitivity = sensitivity + } + + public fun setPrefixPaddingMS(paddingMs: Int): Builder = apply { + this.prefixPaddingMS = paddingMs + } + + public fun setSilenceDurationMS(durationMs: Int): Builder = apply { + this.silenceDurationMS = durationMs + } + + public fun setDisabled(disabled: Boolean): Builder = apply { this.disabled = disabled } + + /** Create a new [LiveActivityDetection] with the attached arguments. */ + public fun build(): LiveActivityDetection = + LiveActivityDetection( + startSensitivity, + endSensitivity, + prefixPaddingMS, + silenceDurationMS, + disabled + ) + } + + internal fun toInternal(): Internal = + Internal( + startSensitivity = + when (startSensitivity) { + Sensitivity.LOW -> Internal.StartSensitivity.LOW + Sensitivity.HIGH -> Internal.StartSensitivity.HIGH + null -> null + }, + endSensitivity = + when (endSensitivity) { + Sensitivity.LOW -> Internal.EndSensitivity.LOW + Sensitivity.HIGH -> Internal.EndSensitivity.HIGH + null -> null + }, + prefixPaddingMs = prefixPaddingMS, + silenceDurationMs = silenceDurationMS, + disabled = disabled + ) + + @Serializable + internal data class Internal( + @SerialName("start_of_speech_sensitivity") val startSensitivity: StartSensitivity? = null, + @SerialName("end_of_speech_sensitivity") val endSensitivity: EndSensitivity? = null, + @SerialName("prefix_padding_ms") val prefixPaddingMs: Int? = null, + @SerialName("silence_duration_ms") val silenceDurationMs: Int? = null, + @SerialName("disabled") val disabled: Boolean? 
= null + ) { + @Serializable + internal enum class StartSensitivity { + @SerialName("START_SENSITIVITY_UNSPECIFIED") UNSPECIFIED, + @SerialName("START_SENSITIVITY_HIGH") HIGH, + @SerialName("START_SENSITIVITY_LOW") LOW + } + + @Serializable + internal enum class EndSensitivity { + @SerialName("END_SENSITIVITY_UNSPECIFIED") UNSPECIFIED, + @SerialName("END_SENSITIVITY_HIGH") HIGH, + @SerialName("END_SENSITIVITY_LOW") LOW + } + } + + public companion object { + /** Creates a new [Builder]. */ + @JvmStatic public fun builder(): Builder = Builder() + } +} + +/** Helper method to construct a [LiveActivityDetection] in a DSL-like manner. */ +@OptIn(PublicPreviewAPI::class) +public fun liveActivityDetection( + init: LiveActivityDetection.Builder.() -> Unit +): LiveActivityDetection { + val builder = LiveActivityDetection.builder() + builder.init() + return builder.build() +} diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt index bbe1633b97e..d296917e8de 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveClientSetupMessage.kt @@ -38,6 +38,7 @@ internal class LiveClientSetupMessage( val outputAudioTranscription: AudioTranscriptionConfig.Internal?, val sessionResumption: SessionResumptionConfig.Internal?, val contextWindowCompression: ContextWindowCompressionConfig.Internal?, + val realtimeInputConfig: LiveRealtimeInputConfig.Internal?, ) { @Serializable internal class Internal(val setup: LiveClientSetup) { @@ -53,6 +54,8 @@ internal class LiveClientSetupMessage( val sessionResumption: SessionResumptionConfig.Internal? = null, @SerialName("context_window_compression") val contextWindowCompression: ContextWindowCompressionConfig.Internal? 
= null, + @SerialName("realtime_input_config") + val realtimeInputConfig: LiveRealtimeInputConfig.Internal? = null, ) } @@ -66,7 +69,8 @@ internal class LiveClientSetupMessage( inputAudioTranscription, outputAudioTranscription, sessionResumption, - contextWindowCompression + contextWindowCompression, + realtimeInputConfig ) ) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt index 764100fe9b2..5de4e9c2c8d 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveGenerationConfig.kt @@ -58,6 +58,8 @@ import kotlinx.serialization.Serializable * @property outputAudioTranscription Specifies the configuration for transcribing output audio from * the model. * + * @property realtimeInputConfig Configures realtime input for the session + * * Refer to the * [Control generated output](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output) * guide for more details. @@ -76,6 +78,7 @@ private constructor( internal val inputAudioTranscription: AudioTranscriptionConfig?, internal val outputAudioTranscription: AudioTranscriptionConfig?, internal val contextWindowCompression: ContextWindowCompressionConfig?, + internal val realtimeInputConfig: LiveRealtimeInputConfig?, ) { /** @@ -105,6 +108,8 @@ private constructor( * @property outputAudioTranscription see [LiveGenerationConfig.outputAudioTranscription] * * @property contextWindowCompression see [LiveGenerationConfig.contextWindowCompression] + * + * @property realtimeInputConfig see [LiveGenerationConfig.realtimeInputConfig] */ public class Builder { @JvmField public var temperature: Float? = null @@ -118,6 +123,7 @@ private constructor( @JvmField public var inputAudioTranscription: AudioTranscriptionConfig? 
= null @JvmField public var outputAudioTranscription: AudioTranscriptionConfig? = null @JvmField public var contextWindowCompression: ContextWindowCompressionConfig? = null + @JvmField public var realtimeInputConfig: LiveRealtimeInputConfig? = null public fun setTemperature(temperature: Float?): Builder = apply { this.temperature = temperature @@ -153,6 +159,10 @@ private constructor( this.contextWindowCompression = config } + public fun setRealtimeInputConfig(config: LiveRealtimeInputConfig?): Builder = apply { + this.realtimeInputConfig = config + } + /** Create a new [LiveGenerationConfig] with the attached arguments. */ public fun build(): LiveGenerationConfig = LiveGenerationConfig( @@ -167,6 +177,7 @@ private constructor( inputAudioTranscription = inputAudioTranscription, outputAudioTranscription = outputAudioTranscription, contextWindowCompression = contextWindowCompression, + realtimeInputConfig = realtimeInputConfig, ) } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt new file mode 100644 index 00000000000..b741ef03100 --- /dev/null +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt @@ -0,0 +1,153 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.firebase.ai.type + +import kotlinx.serialization.SerialName +import kotlinx.serialization.Serializable + +/** + * Configures model input behavior when generating content in the Live API via realtime supported + * methods. + * + * @property automaticActivityDetection Configures automatic activity detection on the model. When + * not set, automatic activity detection is enabled by default. The user only needs to send + * activity signals manually when it is set with [LiveActivityDetection.disabled] as `true`. + * @property activityHandling Defines how the model treats user input activity. + * @property turnCoverage Defines which input is included in the user's turn, relative to the + * starting and ending of the activity. + */ +@PublicPreviewAPI +public class LiveRealtimeInputConfig +private constructor( + public val automaticActivityDetection: LiveActivityDetection?, + public val activityHandling: ActivityHandling?, + public val turnCoverage: TurnCoverage? +) { + + /** How a model handles user input activity. */ + public enum class ActivityHandling { + /** + * When the user sends input marking the start of activity, the model's current response will be + * cut off immediately. + * + * The start of activity could be manually specified in the call, or the model could interpret + * it automatically (depending on the value of `automaticActivityDetection`). + * + * An example of activity starting implicitly could be the user speaking over the model. + */ + INTERRUPT, + + /** + * When the user sends input marking the start of activity, the model will process it, but won't + * cut off its current response. + * + * This is the inverse of [INTERRUPT]. + */ + NO_INTERRUPT + } + + /** How the model considers which input is included in the user's turn. */ + public enum class TurnCoverage { + /** + * The model will exclude inactivity (e.g., silence on the audio stream) from the user's input. + */ + ONLY_ACTIVITY, + + /** + * The model will include all input (including inactivity) since the last turn as the user's + * input.
+ */ + ALL_INPUT + } + + /** Builder for creating a [LiveRealtimeInputConfig]. */ + public class Builder { + @JvmField public var automaticActivityDetection: LiveActivityDetection? = null + @JvmField public var activityHandling: ActivityHandling? = null + @JvmField public var turnCoverage: TurnCoverage? = null + + public fun setAutomaticActivityDetection(config: LiveActivityDetection): Builder = apply { + this.automaticActivityDetection = config + } + + public fun setActivityHandling(handling: ActivityHandling): Builder = apply { + this.activityHandling = handling + } + + public fun setTurnCoverage(coverage: TurnCoverage): Builder = apply { + this.turnCoverage = coverage + } + + /** Create a new [LiveRealtimeInputConfig] with the attached arguments. */ + public fun build(): LiveRealtimeInputConfig = + LiveRealtimeInputConfig(automaticActivityDetection, activityHandling, turnCoverage) + } + + internal fun toInternal(): Internal = + Internal( + automaticActivityDetection = automaticActivityDetection?.toInternal(), + activityHandling = + when (activityHandling) { + ActivityHandling.INTERRUPT -> Internal.ActivityHandling.START_OF_ACTIVITY_INTERRUPTS + ActivityHandling.NO_INTERRUPT -> Internal.ActivityHandling.NO_INTERRUPTION + null -> null + }, + turnCoverage = + when (turnCoverage) { + TurnCoverage.ONLY_ACTIVITY -> Internal.TurnCoverage.ONLY_ACTIVITY + TurnCoverage.ALL_INPUT -> Internal.TurnCoverage.ALL_INPUT + null -> null + } + ) + + @Serializable + internal data class Internal( + @SerialName("automatic_activity_detection") + val automaticActivityDetection: LiveActivityDetection.Internal? = null, + @SerialName("activity_handling") val activityHandling: ActivityHandling? = null, + @SerialName("turn_coverage") val turnCoverage: TurnCoverage? 
= null + ) { + @Serializable + internal enum class ActivityHandling { + @SerialName("ACTIVITY_HANDLING_UNSPECIFIED") UNSPECIFIED, + @SerialName("START_OF_ACTIVITY_INTERRUPTS") START_OF_ACTIVITY_INTERRUPTS, + @SerialName("NO_INTERRUPTION") NO_INTERRUPTION + } + + @Serializable + internal enum class TurnCoverage { + @SerialName("TURN_COVERAGE_UNSPECIFIED") UNSPECIFIED, + @SerialName("TURN_INCLUDES_ONLY_ACTIVITY") ONLY_ACTIVITY, + @SerialName("TURN_INCLUDES_ALL_INPUT") ALL_INPUT, + } + } + + public companion object { + /** Creates a new [Builder]. */ + @JvmStatic public fun builder(): Builder = Builder() + } +} + +/** Helper method to construct a [LiveRealtimeInputConfig] in a DSL-like manner. */ +@OptIn(PublicPreviewAPI::class) +public fun liveRealtimeInputConfig( + init: LiveRealtimeInputConfig.Builder.() -> Unit +): LiveRealtimeInputConfig { + val builder = LiveRealtimeInputConfig.builder() + builder.init() + return builder.build() +} diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt index 7aac4b73da0..e41c631671a 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt @@ -61,6 +61,7 @@ import kotlinx.coroutines.flow.onEach import kotlinx.coroutines.isActive import kotlinx.coroutines.launch import kotlinx.serialization.ExperimentalSerializationApi +import kotlinx.serialization.SerialName import kotlinx.serialization.Serializable import kotlinx.serialization.encodeToString @@ -430,6 +431,37 @@ internal constructor( sendFrame(BidiGenerateContentRealtimeInputSetup(text = text).toInternal()) } + /** + * Manually marks the start of user activity, using the realtime API. 
+ * + * The start of user activity is effectively the start of a user's turn, but depending on the + * configuration defined in [LiveRealtimeInputConfig], it may not be interpreted as an + * interruption. An example of the start of user activity could be the user speaking (not + * silence). + * + * Should be followed with a call to [sendStopActivityRealtime]; after all the data has been sent + * for the user's turn. + * + * Only required when automatic activity detection is disabled via [LiveRealtimeInputConfig]. + */ + public suspend fun sendStartActivityRealtime() { + sendFrame(BidiGenerateContentRealtimeInputSetup(activityStart = true).toInternal()) + } + + /** + * Manually marks the end of user activity, using the realtime API. + * + * The end of user activity is effectively the end of a user's turn, and signals that the model + * can start sending responses. + * + * Should follow after a previous call to [sendStartActivityRealtime]. + * + * Only required when automatic activity detection is disabled via ``LiveRealtimeInputConfig``. + */ + public suspend fun sendStopActivityRealtime() { + sendFrame(BidiGenerateContentRealtimeInputSetup(activityEnd = true).toInternal()) + } + /** * Streams client data to the model. * @@ -706,8 +738,13 @@ internal constructor( val mediaChunks: List? = null, val audio: InlineData? = null, val video: InlineData? = null, - val text: String? = null + val text: String? = null, + val activityStart: Boolean = false, + val activityEnd: Boolean = false ) { + @Serializable internal class ActivityStart + @Serializable internal class ActivityEnd + @Serializable internal class Internal(val realtimeInput: BidiGenerateContentRealtimeInput) { @Serializable @@ -715,7 +752,9 @@ internal constructor( val mediaChunks: List?, val audio: InlineData.Internal?, val video: InlineData.Internal?, - val text: String? + val text: String?, + @SerialName("activity_start") val activityStart: ActivityStart? 
= null, + @SerialName("activity_end") val activityEnd: ActivityEnd? = null ) } fun toInternal() = @@ -724,7 +763,9 @@ internal constructor( mediaChunks?.map { it.toInternal() }, audio?.toInternal(), video?.toInternal(), - text + text, + if (activityStart) ActivityStart() else null, + if (activityEnd) ActivityEnd() else null ) ) } diff --git a/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt b/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt new file mode 100644 index 00000000000..f1002b43170 --- /dev/null +++ b/ai-logic/firebase-ai/src/test/java/com/google/firebase/ai/type/LiveRealtimeInputConfigTest.kt @@ -0,0 +1,161 @@ +/* + * Copyright 2026 Google LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.firebase.ai.type + +import com.google.firebase.ai.common.JSON +import io.kotest.assertions.json.shouldEqualJson +import io.kotest.matchers.equals.shouldBeEqual +import kotlinx.serialization.encodeToString +import org.junit.Test + +@OptIn(PublicPreviewAPI::class) +internal class LiveRealtimeInputConfigTest { + + @Test + fun `Basic LiveRealtimeInputConfig`() { + val config = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + turnCoverage = LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY + automaticActivityDetection = liveActivityDetection { disabled = true } + } + + val expectedJson = + """ + { + "activity_handling": "NO_INTERRUPTION", + "turn_coverage": "TURN_INCLUDES_ONLY_ACTIVITY", + "automatic_activity_detection": { + "disabled": true + } + } + """ + .trimIndent() + + JSON.encodeToString(config.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `LiveActivityDetection full config`() { + val detection = liveActivityDetection { + startSensitivity = LiveActivityDetection.Sensitivity.HIGH + endSensitivity = LiveActivityDetection.Sensitivity.LOW + prefixPaddingMS = 100 + silenceDurationMS = 500 + disabled = false + } + + val expectedJson = + """ + { + "start_of_speech_sensitivity": "START_SENSITIVITY_HIGH", + "end_of_speech_sensitivity": "END_SENSITIVITY_LOW", + "prefix_padding_ms": 100, + "silence_duration_ms": 500, + "disabled": false + } + """ + .trimIndent() + + JSON.encodeToString(detection.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `DSL correctly delegates to Builder`() { + val config = + LiveRealtimeInputConfig.builder() + .setActivityHandling(LiveRealtimeInputConfig.ActivityHandling.INTERRUPT) + .build() + + val configDsl = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.INTERRUPT + } + + config.activityHandling?.shouldBeEqual( + configDsl.activityHandling as LiveRealtimeInputConfig.ActivityHandling + ) + } 
+ + @Test + fun `LiveClientSetupMessage includes LiveRealtimeInputConfig in serialization`() { + val config = liveGenerationConfig { + realtimeInputConfig = liveRealtimeInputConfig { + activityHandling = LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT + } + } + val message = + LiveClientSetupMessage( + model = "my-model", + generationConfig = null, + tools = null, + systemInstruction = null, + inputAudioTranscription = null, + outputAudioTranscription = null, + sessionResumption = null, + contextWindowCompression = null, + realtimeInputConfig = config.realtimeInputConfig?.toInternal() + ) + + val expectedJson = + """ + { + "setup": { + "model": "my-model", + "realtime_input_config": { + "activity_handling": "NO_INTERRUPTION" + } + } + } + """ + .trimIndent() + + JSON.encodeToString(message.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `BidiGenerateContentRealtimeInputSetup with activityStart serializes correctly`() { + val setup = LiveSession.BidiGenerateContentRealtimeInputSetup(activityStart = true) + + val expectedJson = + """ + { + "realtimeInput": { + "activity_start": {} + } + } + """ + .trimIndent() + + JSON.encodeToString(setup.toInternal()).shouldEqualJson(expectedJson) + } + + @Test + fun `BidiGenerateContentRealtimeInputSetup with activityEnd serializes correctly`() { + val setup = LiveSession.BidiGenerateContentRealtimeInputSetup(activityEnd = true) + + val expectedJson = + """ + { + "realtimeInput": { + "activity_end": {} + } + } + """ + .trimIndent() + + JSON.encodeToString(setup.toInternal()).shouldEqualJson(expectedJson) + } +} diff --git a/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java b/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java index e5372155fd0..aeeb317bce2 100644 --- a/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java +++ b/ai-logic/firebase-ai/src/testUtil/java/com/google/firebase/ai/JavaCompileTests.java @@ 
-63,7 +63,9 @@ import com.google.firebase.ai.type.InlineData; import com.google.firebase.ai.type.InlineDataPart; import com.google.firebase.ai.type.LatLng; +import com.google.firebase.ai.type.LiveActivityDetection; import com.google.firebase.ai.type.LiveGenerationConfig; +import com.google.firebase.ai.type.LiveRealtimeInputConfig; import com.google.firebase.ai.type.LiveServerContent; import com.google.firebase.ai.type.LiveServerMessage; import com.google.firebase.ai.type.LiveServerSetupComplete; @@ -183,6 +185,19 @@ private LiveGenerationConfig getLiveConfig() { .setPresencePenalty(2.0F) .setResponseModality(ResponseModality.AUDIO) .setSpeechConfig(new SpeechConfig(new Voice("AOEDE"))) + .setRealtimeInputConfig( + new LiveRealtimeInputConfig.Builder() + .setActivityHandling(LiveRealtimeInputConfig.ActivityHandling.NO_INTERRUPT) + .setTurnCoverage(LiveRealtimeInputConfig.TurnCoverage.ONLY_ACTIVITY) + .setAutomaticActivityDetection( + new LiveActivityDetection.Builder() + .setDisabled(true) + .setStartSensitivity(LiveActivityDetection.Sensitivity.HIGH) + .setEndSensitivity(LiveActivityDetection.Sensitivity.LOW) + .setPrefixPaddingMS(100) + .setSilenceDurationMS(500) + .build()) + .build()) .build(); } @@ -448,6 +463,8 @@ public void onComplete() { session.sendAudioRealtime(new InlineData(bytes, "audio/jxl", null)); session.sendVideoRealtime(new InlineData(bytes, "image/jxl", null)); session.sendTextRealtime("text"); + session.sendStartActivityRealtime(); + session.sendStopActivityRealtime(); FunctionResponsePart functionResponse = new FunctionResponsePart("myFunction", new JsonObject(Map.of())); From a8edacdd58c4137b38e602523c32e03742ae34fe Mon Sep 17 00:00:00 2001 From: Mila <107142260+milaGGL@users.noreply.github.com> Date: Thu, 30 Apr 2026 13:37:42 -0400 Subject: [PATCH 2/4] add changelog, format code --- ai-logic/firebase-ai/CHANGELOG.md | 2 ++ ai-logic/firebase-ai/api.txt | 16 ---------------- .../firebase/ai/type/LiveActivityDetection.kt | 10 +++++----- 
.../firebase/ai/type/LiveRealtimeInputConfig.kt | 6 +++--- .../com/google/firebase/ai/type/LiveSession.kt | 4 ++-- 5 files changed, 12 insertions(+), 26 deletions(-) diff --git a/ai-logic/firebase-ai/CHANGELOG.md b/ai-logic/firebase-ai/CHANGELOG.md index 0b709ed88a0..f402cc6d277 100644 --- a/ai-logic/firebase-ai/CHANGELOG.md +++ b/ai-logic/firebase-ai/CHANGELOG.md @@ -1,5 +1,7 @@ # Unreleased +- [feature] Added support for `LiveRealtimeInputConfig` and `LiveActivityDetection` to configure voice activity detection in Live API. Added `sendStartActivityRealtime` and `sendStopActivityRealtime` to `LiveSession` for manual activity control. (#8080) + - [feature] Added support for `ImageConfig` and `finishMessage`. (#8020) - [feature] Added a Java-friendly wrapper for TemplateChat interactions (`TemplateChatFutures`). diff --git a/ai-logic/firebase-ai/api.txt b/ai-logic/firebase-ai/api.txt index 2528666586a..019a5d6e249 100644 --- a/ai-logic/firebase-ai/api.txt +++ b/ai-logic/firebase-ai/api.txt @@ -1310,16 +1310,6 @@ package com.google.firebase.ai.type { @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveActivityDetection { method public static com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); - method public Boolean? getDisabled(); - method public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? getEndSensitivity(); - method public Integer? getPrefixPaddingMS(); - method public Integer? getSilenceDurationMS(); - method public com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? getStartSensitivity(); - property public final Boolean? disabled; - property public final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? endSensitivity; - property public final Integer? prefixPaddingMS; - property public final Integer? silenceDurationMS; - property public final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity? 
startSensitivity; field public static final com.google.firebase.ai.type.LiveActivityDetection.Companion Companion; } @@ -1421,12 +1411,6 @@ package com.google.firebase.ai.type { @com.google.firebase.ai.type.PublicPreviewAPI public final class LiveRealtimeInputConfig { method public static com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); - method public com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? getActivityHandling(); - method public com.google.firebase.ai.type.LiveActivityDetection? getAutomaticActivityDetection(); - method public com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? getTurnCoverage(); - property public final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling? activityHandling; - property public final com.google.firebase.ai.type.LiveActivityDetection? automaticActivityDetection; - property public final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage? turnCoverage; field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.Companion Companion; } diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt index fd4e1a59d2a..49967c697e8 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt @@ -38,11 +38,11 @@ import kotlinx.serialization.Serializable @PublicPreviewAPI public class LiveActivityDetection private constructor( - public val startSensitivity: Sensitivity?, - public val endSensitivity: Sensitivity?, - public val prefixPaddingMS: Int?, - public val silenceDurationMS: Int?, - public val disabled: Boolean? 
+ internal val startSensitivity: Sensitivity?, + internal val endSensitivity: Sensitivity?, + internal val prefixPaddingMS: Int?, + internal val silenceDurationMS: Int?, + internal val disabled: Boolean? ) { /** How sensitive the model interprets speech activity. */ diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt index b741ef03100..ec68f5b7a88 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt @@ -33,9 +33,9 @@ import kotlinx.serialization.Serializable @PublicPreviewAPI public class LiveRealtimeInputConfig private constructor( - public val automaticActivityDetection: LiveActivityDetection?, - public val activityHandling: ActivityHandling?, - public val turnCoverage: TurnCoverage? + internal val automaticActivityDetection: LiveActivityDetection?, + internal val activityHandling: ActivityHandling?, + internal val turnCoverage: TurnCoverage? ) { /** How a model handles user input activity. */ diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt index e41c631671a..0ef1d889f4e 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveSession.kt @@ -439,7 +439,7 @@ internal constructor( * interruption. An example of the start of user activity could be the user speaking (not * silence). * - * Should be followed with a call to [sendStopActivityRealtime]; after all the data has been sent + * Should be followed with a call to [sendStopActivityRealtime] after all the data has been sent * for the user's turn. 
* * Only required when automatic activity detection is disabled via [LiveRealtimeInputConfig]. @@ -456,7 +456,7 @@ internal constructor( * * Should follow after a previous call to [sendStartActivityRealtime]. * - * Only required when automatic activity detection is disabled via ``LiveRealtimeInputConfig``. + * Only required when automatic activity detection is disabled via [LiveRealtimeInputConfig]. */ public suspend fun sendStopActivityRealtime() { sendFrame(BidiGenerateContentRealtimeInputSetup(activityEnd = true).toInternal()) From a708b302070a841b1ea6a5355310be11e40a339f Mon Sep 17 00:00:00 2001 From: Mila <107142260+milaGGL@users.noreply.github.com> Date: Fri, 1 May 2026 11:40:45 -0400 Subject: [PATCH 3/4] Update LiveSessionTests.kt --- .../kotlin/com/google/firebase/ai/LiveSessionTests.kt | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt index e2536fdfadc..010d1eeaec1 100644 --- a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt +++ b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt @@ -283,7 +283,6 @@ class LiveSessionTests { val session = liveModel.connect(SessionResumptionConfig()) session.send("My favorite color is blue. Remember that.", true) var lastResumptionUpdate: LiveSessionResumptionUpdate? 
= null - var gotTurnComplete = false withTimeout(30.seconds) { session .receive() @@ -292,10 +291,10 @@ class LiveSessionTests { lastResumptionUpdate = it } if (it is LiveServerContent && it.turnComplete) { - gotTurnComplete = true + false + } else { + true } - // Stop collecting when there's a new handle AND turnComplete is true - !(gotTurnComplete && lastResumptionUpdate?.newHandle != null) } .collect {} } From 4268984d8f918db28648b6c21e80909c150c2d44 Mon Sep 17 00:00:00 2001 From: Mila <107142260+milaGGL@users.noreply.github.com> Date: Mon, 4 May 2026 12:35:54 -0400 Subject: [PATCH 4/4] resolve comments --- ai-logic/firebase-ai/api.txt | 30 ++++-- .../google/firebase/ai/LiveSessionTests.kt | 26 ++++-- .../firebase/ai/type/LiveActivityDetection.kt | 59 ++++-------- .../ai/type/LiveRealtimeInputConfig.kt | 92 +++++++------------ 4 files changed, 91 insertions(+), 116 deletions(-) diff --git a/ai-logic/firebase-ai/api.txt b/ai-logic/firebase-ai/api.txt index 019a5d6e249..17e1c1c66b2 100644 --- a/ai-logic/firebase-ai/api.txt +++ b/ai-logic/firebase-ai/api.txt @@ -1332,9 +1332,13 @@ package com.google.firebase.ai.type { method public com.google.firebase.ai.type.LiveActivityDetection.Builder builder(); } - public enum LiveActivityDetection.Sensitivity { - enum_constant public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity HIGH; - enum_constant public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity LOW; + public static final class LiveActivityDetection.Sensitivity { + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity.Companion Companion; + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity HIGH; + field public static final com.google.firebase.ai.type.LiveActivityDetection.Sensitivity LOW; + } + + public static final class LiveActivityDetection.Sensitivity.Companion { } public final class LiveActivityDetectionKt { @@ -1414,9 +1418,13 @@ 
package com.google.firebase.ai.type { field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.Companion Companion; } - public enum LiveRealtimeInputConfig.ActivityHandling { - enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling INTERRUPT; - enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling NO_INTERRUPT; + public static final class LiveRealtimeInputConfig.ActivityHandling { + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling.Companion Companion; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling INTERRUPT; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.ActivityHandling NO_INTERRUPT; + } + + public static final class LiveRealtimeInputConfig.ActivityHandling.Companion { } public static final class LiveRealtimeInputConfig.Builder { @@ -1434,9 +1442,13 @@ package com.google.firebase.ai.type { method public com.google.firebase.ai.type.LiveRealtimeInputConfig.Builder builder(); } - public enum LiveRealtimeInputConfig.TurnCoverage { - enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ALL_INPUT; - enum_constant public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ONLY_ACTIVITY; + public static final class LiveRealtimeInputConfig.TurnCoverage { + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ALL_INPUT; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage.Companion Companion; + field public static final com.google.firebase.ai.type.LiveRealtimeInputConfig.TurnCoverage ONLY_ACTIVITY; + } + + public static final class LiveRealtimeInputConfig.TurnCoverage.Companion { } public final class LiveRealtimeInputConfigKt { diff --git 
a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt index 475b63ec8af..b44d0e02a31 100644 --- a/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt +++ b/ai-logic/firebase-ai/src/androidTest/kotlin/com/google/firebase/ai/LiveSessionTests.kt @@ -332,15 +332,20 @@ class LiveSessionTests { automaticActivityDetection = liveActivityDetection { disabled = true } } } - val liveModel = getLiveModel(modelName = modelName, config = config) + val liveModel = + getLiveModel( + modelName = modelName, + config = config, + systemInstruction = SystemInstructions.yesOrNo + ) val session = liveModel.connect() try { session.sendStartActivityRealtime() - session.sendTextRealtime("Hello") + session.sendTextRealtime("Does five plus five equal ten?") session.sendStopActivityRealtime() - // Wait for some response, just ensure no crash - val text = withTimeoutOrNull(10.seconds) { session.collectNextAudioOutputTranscript() } + val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } ?: "" + text.toLowerCasePreservingASCIIRules() shouldContain "yes" } finally { session.close() } @@ -363,12 +368,17 @@ class LiveSessionTests { } } } - val liveModel = getLiveModel(modelName = modelName, config = config) + val liveModel = + getLiveModel( + modelName = modelName, + config = config, + systemInstruction = SystemInstructions.yesOrNo + ) val session = liveModel.connect() try { - session.sendTextRealtime("Hello") - val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } - text.shouldNotBeNull() + session.sendTextRealtime("Is the sky blue?") + val text = withTimeoutOrNull(15.seconds) { session.collectNextAudioOutputTranscript() } ?: "" + text.toLowerCasePreservingASCIIRules() shouldContain "yes" } finally { session.close() } diff --git 
a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt index 49967c697e8..c4290d103df 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveActivityDetection.kt @@ -46,17 +46,20 @@ private constructor( ) { /** How sensitive the model interprets speech activity. */ - public enum class Sensitivity { - /** - * The model will detect speech less often. In other words, a higher volume of speech is - * required for the model to consider the user is speaking. - */ - LOW, - /** - * The model will detect speech more often. In other words, a lower volume of speech is required - * for the model to consider the user is speaking. - */ - HIGH + public class Sensitivity private constructor(internal val value: String) { + public companion object { + /** + * The model will detect speech less often. In other words, a higher volume of speech is + * required for the model to consider the user is speaking. + */ + @JvmField public val LOW: Sensitivity = Sensitivity("LOW") + + /** + * The model will detect speech more often. In other words, a lower volume of speech is + * required for the model to consider the user is speaking. + */ + @JvmField public val HIGH: Sensitivity = Sensitivity("HIGH") + } } /** Builder for creating a [LiveActivityDetection]. 
*/ @@ -98,18 +101,8 @@ private constructor( internal fun toInternal(): Internal = Internal( - startSensitivity = - when (startSensitivity) { - Sensitivity.LOW -> Internal.StartSensitivity.LOW - Sensitivity.HIGH -> Internal.StartSensitivity.HIGH - null -> null - }, - endSensitivity = - when (endSensitivity) { - Sensitivity.LOW -> Internal.EndSensitivity.LOW - Sensitivity.HIGH -> Internal.EndSensitivity.HIGH - null -> null - }, + startSensitivity = startSensitivity?.let { "START_SENSITIVITY_${it.value}" }, + endSensitivity = endSensitivity?.let { "END_SENSITIVITY_${it.value}" }, prefixPaddingMs = prefixPaddingMS, silenceDurationMs = silenceDurationMS, disabled = disabled @@ -117,26 +110,12 @@ private constructor( @Serializable internal data class Internal( - @SerialName("start_of_speech_sensitivity") val startSensitivity: StartSensitivity? = null, - @SerialName("end_of_speech_sensitivity") val endSensitivity: EndSensitivity? = null, + @SerialName("start_of_speech_sensitivity") val startSensitivity: String? = null, + @SerialName("end_of_speech_sensitivity") val endSensitivity: String? = null, @SerialName("prefix_padding_ms") val prefixPaddingMs: Int? = null, @SerialName("silence_duration_ms") val silenceDurationMs: Int? = null, @SerialName("disabled") val disabled: Boolean? = null - ) { - @Serializable - internal enum class StartSensitivity { - @SerialName("START_SENSITIVITY_UNSPECIFIED") UNSPECIFIED, - @SerialName("START_SENSITIVITY_HIGH") HIGH, - @SerialName("START_SENSITIVITY_LOW") LOW - } - - @Serializable - internal enum class EndSensitivity { - @SerialName("END_SENSITIVITY_UNSPECIFIED") UNSPECIFIED, - @SerialName("END_SENSITIVITY_HIGH") HIGH, - @SerialName("END_SENSITIVITY_LOW") LOW - } - } + ) public companion object { /** Creates a new [Builder]. 
*/ diff --git a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt index ec68f5b7a88..4cc2e7a23c3 100644 --- a/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt +++ b/ai-logic/firebase-ai/src/main/kotlin/com/google/firebase/ai/type/LiveRealtimeInputConfig.kt @@ -39,39 +39,37 @@ private constructor( ) { /** How a model handles user input activity. */ - public enum class ActivityHandling { - /** - * When the user sends input marking the start of activity, the model's current response will be - * cut-off immediately. - * - * The start of activity could be manually specified in the call, or the model could interpret - * it automatically (depending on the value of `automaticActivityDetection`). - * - * An example of activity starting implicitly could be the user speaking over the model. - */ - INTERRUPT, - - /** - * When the user sends input marking the start of activity, the model will process it, but won't - * cut-off its current response. - * - * This is the inverse of `interrupt`. - */ - NO_INTERRUPT + public class ActivityHandling private constructor(internal val value: String) { + public companion object { + /** + * When the user sends input marking the start of activity, the model's current response will + * be cut-off immediately. + */ + @JvmField + public val INTERRUPT: ActivityHandling = ActivityHandling("START_OF_ACTIVITY_INTERRUPTS") + + /** + * When the user sends input marking the start of activity, the model will process it, but + * won't cut-off its current response. + */ + @JvmField public val NO_INTERRUPT: ActivityHandling = ActivityHandling("NO_INTERRUPTION") + } } /** How the model considers which input is included in the user's turn. */ - public enum class TurnCoverage { - /** - * The model will exclude inactivity (e.g, silence on the audio stream) from the user's input. 
- */ - ONLY_ACTIVITY, - - /** - * The model will include all input (including inactivity) since the last turn as the user's - * input. - */ - ALL_INPUT + public class TurnCoverage private constructor(internal val value: String) { + public companion object { + /** + * The model will exclude inactivity (e.g, silence on the audio stream) from the user's input. + */ + @JvmField public val ONLY_ACTIVITY: TurnCoverage = TurnCoverage("TURN_INCLUDES_ONLY_ACTIVITY") + + /** + * The model will include all input (including inactivity) since the last turn as the user's + * input. + */ + @JvmField public val ALL_INPUT: TurnCoverage = TurnCoverage("TURN_INCLUDES_ALL_INPUT") + } } /** Builder for creating a [LiveRealtimeInputConfig]. */ @@ -100,41 +98,17 @@ private constructor( internal fun toInternal(): Internal = Internal( automaticActivityDetection = automaticActivityDetection?.toInternal(), - activityHandling = - when (activityHandling) { - ActivityHandling.INTERRUPT -> Internal.ActivityHandling.START_OF_ACTIVITY_INTERRUPTS - ActivityHandling.NO_INTERRUPT -> Internal.ActivityHandling.NO_INTERRUPTION - null -> null - }, - turnCoverage = - when (turnCoverage) { - TurnCoverage.ONLY_ACTIVITY -> Internal.TurnCoverage.ONLY_ACTIVITY - TurnCoverage.ALL_INPUT -> Internal.TurnCoverage.ALL_INPUT - null -> null - } + activityHandling = activityHandling?.value, + turnCoverage = turnCoverage?.value ) @Serializable internal data class Internal( @SerialName("automatic_activity_detection") val automaticActivityDetection: LiveActivityDetection.Internal? = null, - @SerialName("activity_handling") val activityHandling: ActivityHandling? = null, - @SerialName("turn_coverage") val turnCoverage: TurnCoverage? 
= null - ) { - @Serializable - internal enum class ActivityHandling { - @SerialName("ACTIVITY_HANDLING_UNSPECIFIED") UNSPECIFIED, - @SerialName("START_OF_ACTIVITY_INTERRUPTS") START_OF_ACTIVITY_INTERRUPTS, - @SerialName("NO_INTERRUPTION") NO_INTERRUPTION - } - - @Serializable - internal enum class TurnCoverage { - @SerialName("TURN_COVERAGE_UNSPECIFIED") UNSPECIFIED, - @SerialName("TURN_INCLUDES_ONLY_ACTIVITY") ONLY_ACTIVITY, - @SerialName("TURN_INCLUDES_ALL_INPUT") ALL_INPUT, - } - } + @SerialName("activity_handling") val activityHandling: String? = null, + @SerialName("turn_coverage") val turnCoverage: String? = null + ) public companion object { /** Creates a new [Builder]. */