Created
August 19, 2025 23:57
-
-
Save notnotrishi/368d5267c2baf35aefd47177b66f6cbb to your computer and use it in GitHub Desktop.
Cartesia STT streaming example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import asyncio | |
| import os | |
| import pyaudio | |
| from cartesia import AsyncCartesia | |
async def streaming_stt_example():
    """
    Stream microphone audio to Cartesia STT and print transcripts live.

    Opens a Cartesia STT websocket and a PyAudio input stream, then runs two
    coroutines concurrently: one reads 16 kHz mono 16-bit PCM chunks from the
    microphone and sends them over the websocket, the other prints partial
    transcripts in place, final transcripts on their own line, and word-level
    timestamps whenever a final result carries them.

    The API key is read from the ``CARTESIA_API_KEY`` environment variable.
    Exceptions raised while streaming are printed rather than propagated, and
    every resource that was actually acquired is released before returning.
    """
    client = AsyncCartesia(
        api_key=os.getenv("CARTESIA_API_KEY"),
    )

    # Audio configuration - optimized to reduce word dropping
    SAMPLE_RATE = 16000
    CHUNK_SIZE = 4096  # Frames per read; raised from 1024 for better continuity
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Bound up front so the cleanup below can tell what was actually opened,
    # even when connecting or opening the device fails part-way through.
    ws = None
    stream = None

    # Set by the receiver once it stops, so the capture loop does not keep
    # feeding audio into a session that has already ended.
    session_over = asyncio.Event()

    try:
        # Create websocket connection with voice activity detection
        ws = await client.stt.websocket(
            model="ink-whisper",             # Model (required)
            language="en",                   # Language of your audio (required)
            encoding="pcm_s16le",            # Audio encoding format (required)
            sample_rate=SAMPLE_RATE,         # Audio sample rate (required)
            min_volume=0.05,                 # Lowered from 0.15 - more sensitive to quiet speech
            max_silence_duration_secs=1.0,   # Increased from 0.3 - allow longer natural pauses
        )

        # Open microphone stream
        stream = audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        print("π€ Listening to microphone... (Press Ctrl+C to stop)")

        def read_chunk():
            """Read one chunk from the microphone (blocking; run in a worker thread)."""
            # Input overflow is tolerated instead of raised, so a late read
            # does not abort the whole capture loop.
            return stream.read(CHUNK_SIZE, exception_on_overflow=False)

        async def end_session():
            """Tell the server no more audio is coming, unless the session already ended."""
            if session_over.is_set():
                return
            await ws.send("finalize")
            await ws.send("done")

        async def capture_microphone():
            """Capture audio from microphone and send to STT websocket"""
            loop = asyncio.get_running_loop()
            try:
                while not session_over.is_set():
                    # The read blocks, so it runs in the default executor to
                    # keep the event loop free for the receiver.
                    audio_data = await loop.run_in_executor(None, read_chunk)
                    if audio_data:
                        await ws.send(audio_data)
                    # No sleep here: any pause would leave gaps in the audio
            except (KeyboardInterrupt, asyncio.CancelledError):
                # Under asyncio.run, Ctrl+C normally reaches this coroutine as
                # a cancellation, so both are treated as a stop request.
                print("\nπ Stopping microphone capture...")
                await end_session()
                raise
            except Exception as e:
                print(f"Error capturing microphone: {e}")
                await end_session()

        async def receive_transcripts():
            """Receive and process transcription results with word timestamps"""
            try:
                async for result in ws.receive():
                    if result['type'] == 'transcript':
                        text = result['text']
                        is_final = result['is_final']

                        if is_final:
                            # Final result - print it and move to new line
                            if text.strip():
                                print(f"\rβ {text}")

                                # Handle word-level timestamps if available
                                if 'words' in result and result['words']:
                                    print("π Word timestamps:")
                                    for word_info in result['words']:
                                        word = word_info['word']
                                        start = word_info['start']
                                        end = word_info['end']
                                        print(f" '{word}': {start:.2f}s - {end:.2f}s")
                                    print()  # Add blank line for readability
                        else:
                            # Partial result - overwrite the current line
                            if text.strip():
                                print(f"\rπ {text}", end="", flush=True)

                    elif result['type'] == 'done':
                        print("\nπ Transcription session ended")
                        break
            except Exception as e:
                print(f"Error receiving transcripts: {e}")

            # Reached on a normal end or a receive error, but not on
            # cancellation, so a Ctrl+C still lets capture send finalize/done.
            session_over.set()

        # Use asyncio.gather to run microphone capture and transcript receiving concurrently
        await asyncio.gather(
            capture_microphone(),
            receive_transcripts(),
        )

    except KeyboardInterrupt:
        print("\nπ Goodbye!")
    except Exception as e:
        print(f"STT streaming error: {e}")
    finally:
        # Clean up resources. Each step sits in its own try/finally so that a
        # failure in one does not prevent the remaining ones from running.
        try:
            if stream is not None:
                stream.stop_stream()
                stream.close()
        finally:
            try:
                audio.terminate()
            finally:
                try:
                    if ws is not None:
                        await ws.close()
                finally:
                    await client.close()
# Script entry point: drive the example on a fresh event loop
if __name__ == "__main__":
    example = streaming_stt_example()
    try:
        asyncio.run(example)
    except KeyboardInterrupt:
        # Ctrl+C surfaces here once asyncio.run has unwound the event loop
        print("\nπ Goodbye!")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment