-
-
Save davidzhao/61f43c479369b74e436033efed3db88f to your computer and use it in GitHub Desktop.
| import logging | |
| from dotenv import load_dotenv | |
| from livekit.agents import ( | |
| Agent, | |
| AgentSession, | |
| JobContext, | |
| WorkerOptions, | |
| cli, | |
| ) | |
| from livekit.plugins import deepgram, silero | |
# module-level logger used by this agent worker
logger = logging.getLogger("flux-agent")

# pull settings from a local .env file before anything reads the environment
load_dotenv()
class MyAgent(Agent):
    """Voice agent persona named Flux.

    The persona is defined by the instruction text handed to the ``Agent``
    base class; when the agent enters a session it asks the session to
    generate a spoken introduction.
    """

    def __init__(self) -> None:
        # Adjacent string literals are joined with no separator, so every
        # fragment except the last ends with a space. Without it the prompt
        # runs sentences together ("...via voice.with that in mind...").
        super().__init__(
            instructions="Your name is Flux. You would interact with users via voice. "
            "with that in mind keep your responses concise and to the point. "
            "do not use emojis, asterisks, markdown, or other special characters in your responses. "
            "You are curious and friendly, and have a sense of humor. "
            "you will speak english to the user",
        )

    async def on_enter(self) -> None:
        # when the agent is added to the session, it'll generate a reply
        # according to its instructions
        self.session.generate_reply(instructions="introduce yourself to the user")
async def entrypoint(ctx: JobContext):
    """Build the voice pipeline for a job and start the agent in its room."""
    # every log entry for this job carries the room name
    ctx.log_context_fields = {"room": ctx.room.name}

    # Silero VAD gives the best interruption handling; Flux ships its own
    # VAD as well, so dropping this one is worth experimenting with
    voice_activity = silero.VAD.load()
    flux_stt = deepgram.STTv2(model="flux-general-en", eager_eot_threshold=0.4)

    agent_session = AgentSession(
        vad=voice_activity,
        llm="openai/gpt-4.1",
        stt=flux_stt,
        tts="cartesia/sonic-2",
        # let the LLM begin answering before the end of turn is confirmed;
        # this pairs with Flux's EagerEndOfTurn events to start generation earlier
        preemptive_generation=True,
        # background noise can trigger a false-positive interruption; when one
        # is detected the agent's speech may be resumed
        resume_false_interruption=True,
    )

    await agent_session.start(agent=MyAgent(), room=ctx.room)
if __name__ == "__main__":
    # run the worker CLI with `entrypoint` as the per-job entry function
    worker_options = WorkerOptions(entrypoint_fnc=entrypoint)
    cli.run_app(worker_options)
What's the version of LiveKit that supports this?
+1
What's the version of LiveKit that supports this?
Version 1.2.14 — see livekit/agents#3561 (comment).
Default Turn Detection Mode:
When both vad and stt are provided but turn_detection is not explicitly set, which mode is used? Based on the code (_vad_base_turn_detection = self._turn_detection_mode in ("vad", None)), it appears VAD mode is the default, meaning Deepgram Flux's intelligent EndOfTurn events are ignored. Is this correct?
Your Comment: In your gist, you mention "Flux also has VAD builtin, so you can try disabling this." Does this mean:
- Deepgram Flux has its own voice activity detection server-side?
- We can remove vad=silero.VAD.load() entirely and rely on Flux's turn detection?
- Or should we keep external VAD for interruption handling but use turn_detection="stt" for end-of-turn?
STT Mode Behavior: If we explicitly set turn_detection="stt", is min_endpointing_delay additive with Deepgram's eot_timeout_ms, or does it work differently? The docs say "in STT mode it is applied after the STT end-of-speech signal, so it can be additive."
Recommended Configuration: For optimal latency while maintaining good turn detection accuracy with Flux, which configuration would you recommend?
- Option A: vad=silero.VAD.load() + turn_detection="stt" + low min_endpointing_delay
- Option B: No VAD + rely entirely on Flux's built-in detection
- Option C: Current setup (VAD mode) but tune Silero's parameters
What's the version of LiveKit that supports this?