Skip to content

Instantly share code, notes, and snippets.

@notnotrishi
Created August 19, 2025 23:57
Show Gist options
  • Select an option

  • Save notnotrishi/368d5267c2baf35aefd47177b66f6cbb to your computer and use it in GitHub Desktop.

Select an option

Save notnotrishi/368d5267c2baf35aefd47177b66f6cbb to your computer and use it in GitHub Desktop.
Cartesia STT streaming example
import asyncio
import os
import pyaudio
from cartesia import AsyncCartesia
async def streaming_stt_example():
    """
    Stream microphone audio to Cartesia STT and print transcripts live.

    Opens an STT websocket and a PyAudio input stream, then runs two
    coroutines concurrently: one reads microphone chunks and sends them to
    the websocket, the other prints partial results in place, final results
    on their own line, and word-level timestamps when a final result
    carries them.

    Exceptions are reported on stdout rather than raised. Whatever was
    successfully created (microphone stream, websocket) is released on exit,
    along with PyAudio and the client, even when setup fails part-way.
    """
    client = AsyncCartesia(
        api_key=os.getenv("CARTESIA_API_KEY"),
    )

    # Audio configuration - optimized to reduce word dropping
    SAMPLE_RATE = 16000
    CHUNK_SIZE = 4096  # Increased from 1024 - larger chunks for better continuity
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Bound up front so the cleanup in `finally` can tell what was actually
    # created if connecting or opening the microphone fails.
    ws = None
    stream = None

    try:
        # Create websocket connection with voice activity detection
        ws = await client.stt.websocket(
            model="ink-whisper",  # Model (required)
            language="en",  # Language of your audio (required)
            encoding="pcm_s16le",  # Audio encoding format (required)
            sample_rate=SAMPLE_RATE,  # Audio sample rate (required)
            min_volume=0.05,  # Lowered from 0.15 - more sensitive to quiet speech
            max_silence_duration_secs=1.0,  # Increased from 0.3 - allow longer natural pauses
        )

        # Open microphone stream
        stream = audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        print("🎤 Listening to microphone... (Press Ctrl+C to stop)")

        # Set by the receiver when it stops, so the capture loop does not keep
        # reading and sending audio after the session is over.
        session_over = asyncio.Event()

        async def end_session():
            """Send the 'finalize' and 'done' control messages to the websocket."""
            await ws.send("finalize")
            await ws.send("done")

        async def capture_microphone():
            """Capture audio from microphone and send to STT websocket"""
            loop = asyncio.get_running_loop()
            try:
                while not session_over.is_set():
                    # stream.read blocks until a chunk is available, so it runs
                    # in the default executor to keep the event loop free. The
                    # trailing False is PyAudio's exception_on_overflow flag.
                    # Chunks are sent back-to-back (no sleep) to avoid gaps in
                    # the transmitted audio.
                    audio_data = await loop.run_in_executor(
                        None, stream.read, CHUNK_SIZE, False
                    )
                    if session_over.is_set():
                        # The session ended while we were blocked reading.
                        break
                    if audio_data:
                        await ws.send(audio_data)
            except KeyboardInterrupt:
                print("\n🛑 Stopping microphone capture...")
                await end_session()
            except Exception as e:
                print(f"Error capturing microphone: {e}")
                await end_session()

        async def receive_transcripts():
            """Receive and process transcription results with word timestamps"""
            try:
                async for result in ws.receive():
                    if result['type'] == 'transcript':
                        text = result['text']
                        if result['is_final']:
                            # Final result - print it and move to new line
                            if text.strip():
                                print(f"\r✅ {text}")

                            # Handle word-level timestamps if available
                            if 'words' in result and result['words']:
                                print("📝 Word timestamps:")
                                for word_info in result['words']:
                                    word = word_info['word']
                                    start = word_info['start']
                                    end = word_info['end']
                                    print(f" '{word}': {start:.2f}s - {end:.2f}s")
                                print()  # Add blank line for readability
                        elif text.strip():
                            # Partial result - redraw the current line in place
                            print(f"\r🔄 {text}", end="", flush=True)
                    elif result['type'] == 'done':
                        print("\n🏁 Transcription session ended")
                        break
            except Exception as e:
                print(f"Error receiving transcripts: {e}")
            finally:
                # Whether we finished, failed or were cancelled, tell the
                # capture loop to stop feeding the websocket.
                session_over.set()

        # Use asyncio.gather to run microphone capture and transcript receiving concurrently
        await asyncio.gather(
            capture_microphone(),
            receive_transcripts(),
        )
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"STT streaming error: {e}")
    finally:
        # Clean up resources; skip anything that was never created.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        audio.terminate()
        if ws is not None:
            await ws.close()
        await client.close()
# Run the example
if __name__ == "__main__":
    try:
        asyncio.run(streaming_stt_example())
    except KeyboardInterrupt:
        # Ctrl+C surfaces here as KeyboardInterrupt once asyncio.run() has
        # shut the event loop down; exit quietly instead of with a traceback.
        print("\n👋 Goodbye!")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment