notnotrishi · August 19, 2025 23:57
diff --git a/stt.py b/stt.py
 import asyncio
 import os
 import pyaudio
 from cartesia import AsyncCartesia

 async def streaming_stt_example() -> None:
    """
    Stream microphone audio to Cartesia STT and print transcripts live.

    Opens an STT websocket and a PyAudio input stream, then runs two
    coroutines concurrently: one forwards microphone chunks to the
    websocket, the other prints partial transcripts in place, final
    transcripts on their own line, and word-level timestamps whenever a
    final result carries a non-empty 'words' list.

    The API key is read from the CARTESIA_API_KEY environment variable.
    Exceptions raised while streaming are printed rather than propagated.
    The microphone stream, PyAudio, the websocket and the client are
    released in the ``finally`` block, including when the websocket
    connection itself could not be established.
    """
    client = AsyncCartesia(
        api_key=os.getenv("CARTESIA_API_KEY"),
    )

    # Audio configuration - optimized to reduce word dropping
    SAMPLE_RATE = 16000
    CHUNK_SIZE = 4096  # Increased from 1024 - larger chunks for better continuity
    FORMAT = pyaudio.paInt16
    CHANNELS = 1

    # Initialize PyAudio
    audio = pyaudio.PyAudio()

    # Bound before the try block so the cleanup in `finally` can tell which
    # resources were actually opened, even if connecting fails part-way.
    ws = None
    stream = None

    try:
        # Create websocket connection with voice activity detection
        ws = await client.stt.websocket(
            model="ink-whisper",             # Model (required)
            language="en",                   # Language of your audio (required)
            encoding="pcm_s16le",            # Audio encoding format (required)
            sample_rate=SAMPLE_RATE,         # Audio sample rate (required)
            min_volume=0.05,                 # Lowered from 0.15 - more sensitive to quiet speech
            max_silence_duration_secs=1.0,   # Increased from 0.3 - allow longer natural pauses
        )

        # Open microphone stream
        stream = audio.open(
            format=FORMAT,
            channels=CHANNELS,
            rate=SAMPLE_RATE,
            input=True,
            frames_per_buffer=CHUNK_SIZE,
        )

        print("🎤 Listening to microphone... (Press Ctrl+C to stop)")

        # Set when the receiver exits (session done or receive error) so the
        # capture loop stops sending audio nobody is reading results for.
        receiver_finished = asyncio.Event()

        async def end_session():
            """Send the "finalize" and "done" commands over the websocket."""
            await ws.send("finalize")
            await ws.send("done")

        async def capture_microphone():
            """Capture audio from microphone and send to STT websocket"""
            loop = asyncio.get_running_loop()
            try:
                while not receiver_finished.is_set():
                    # stream.read() blocks until a chunk is available, so it
                    # runs in the default executor to keep the event loop
                    # free. exception_on_overflow=False makes PyAudio drop
                    # overflowed input instead of raising.
                    audio_data = await loop.run_in_executor(
                        None,
                        lambda: stream.read(
                            CHUNK_SIZE, exception_on_overflow=False
                        ),
                    )

                    # Re-check the flag: the receiver may have finished
                    # while this coroutine was waiting on the read.
                    if audio_data and not receiver_finished.is_set():
                        await ws.send(audio_data)

            except KeyboardInterrupt:
                # NOTE(review): under asyncio.run() Ctrl+C usually reaches
                # tasks as cancellation rather than KeyboardInterrupt, so
                # this branch may rarely fire - confirm the intended
                # shutdown path.
                print("\n🛑 Stopping microphone capture...")
                await end_session()
            except Exception as e:
                print(f"Error capturing microphone: {e}")
                await end_session()

        async def receive_transcripts():
            """Receive and process transcription results with word timestamps"""
            try:
                async for result in ws.receive():
                    if result['type'] == 'transcript':
                        text = result['text']

                        if result['is_final']:
                            # Final result - print it and move to new line
                            if text.strip():
                                print(f"\r✅ {text}")

                            # Handle word-level timestamps if available
                            if 'words' in result and result['words']:
                                print("📝 Word timestamps:")
                                for word_info in result['words']:
                                    word = word_info['word']
                                    start = word_info['start']
                                    end = word_info['end']
                                    print(f"   '{word}': {start:.2f}s - {end:.2f}s")
                                print()  # Add blank line for readability
                        elif text.strip():
                            # Partial result - redraw the current line in place
                            print(f"\r🔄 {text}", end="", flush=True)

                    elif result['type'] == 'done':
                        print("\n🏁 Transcription session ended")
                        break

            except Exception as e:
                print(f"Error receiving transcripts: {e}")
            finally:
                # Tell the capture loop to stop on every exit path.
                receiver_finished.set()

        # Use asyncio.gather to run microphone capture and transcript receiving concurrently
        await asyncio.gather(
            capture_microphone(),
            receive_transcripts(),
        )

    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
    except Exception as e:
        print(f"STT streaming error: {e}")
    finally:
        # Clean up resources. Nested try/finally so that a failure in one
        # step does not skip closing the websocket or the client.
        try:
            if stream is not None:
                stream.stop_stream()
                stream.close()
            audio.terminate()
        finally:
            try:
                if ws is not None:
                    await ws.close()
            finally:
                await client.close()

 # Run the example
 if __name__ == "__main__":
    # Script entry point: build the example coroutine, then hand it to
    # asyncio.run(); an interrupt that escapes the run ends the program
    # with a short farewell instead of a traceback.
    example_coro = streaming_stt_example()
    try:
        asyncio.run(example_coro)
    except KeyboardInterrupt:
        print("\n👋 Goodbye!")
	import asyncio
	import os
	import pyaudio
	from cartesia import AsyncCartesia

	async def streaming_stt_example() -> None:
	    """
	    Stream microphone audio to Cartesia STT and print transcripts live.

	    Opens an STT websocket and a PyAudio input stream, then runs two
	    coroutines concurrently: one forwards microphone chunks to the
	    websocket, the other prints partial transcripts in place, final
	    transcripts on their own line, and word-level timestamps whenever a
	    final result carries a non-empty 'words' list.

	    The API key is read from the CARTESIA_API_KEY environment variable.
	    Exceptions raised while streaming are printed rather than propagated.
	    The microphone stream, PyAudio, the websocket and the client are
	    released in the ``finally`` block, including when the websocket
	    connection itself could not be established.
	    """
	    client = AsyncCartesia(
	        api_key=os.getenv("CARTESIA_API_KEY"),
	    )

	    # Audio configuration - optimized to reduce word dropping
	    SAMPLE_RATE = 16000
	    CHUNK_SIZE = 4096  # Increased from 1024 - larger chunks for better continuity
	    FORMAT = pyaudio.paInt16
	    CHANNELS = 1

	    # Initialize PyAudio
	    audio = pyaudio.PyAudio()

	    # Bound before the try block so the cleanup in `finally` can tell which
	    # resources were actually opened, even if connecting fails part-way.
	    ws = None
	    stream = None

	    try:
	        # Create websocket connection with voice activity detection
	        ws = await client.stt.websocket(
	            model="ink-whisper",             # Model (required)
	            language="en",                   # Language of your audio (required)
	            encoding="pcm_s16le",            # Audio encoding format (required)
	            sample_rate=SAMPLE_RATE,         # Audio sample rate (required)
	            min_volume=0.05,                 # Lowered from 0.15 - more sensitive to quiet speech
	            max_silence_duration_secs=1.0,   # Increased from 0.3 - allow longer natural pauses
	        )

	        # Open microphone stream
	        stream = audio.open(
	            format=FORMAT,
	            channels=CHANNELS,
	            rate=SAMPLE_RATE,
	            input=True,
	            frames_per_buffer=CHUNK_SIZE,
	        )

	        print("🎤 Listening to microphone... (Press Ctrl+C to stop)")

	        # Set when the receiver exits (session done or receive error) so the
	        # capture loop stops sending audio nobody is reading results for.
	        receiver_finished = asyncio.Event()

	        async def end_session():
	            """Send the "finalize" and "done" commands over the websocket."""
	            await ws.send("finalize")
	            await ws.send("done")

	        async def capture_microphone():
	            """Capture audio from microphone and send to STT websocket"""
	            loop = asyncio.get_running_loop()
	            try:
	                while not receiver_finished.is_set():
	                    # stream.read() blocks until a chunk is available, so it
	                    # runs in the default executor to keep the event loop
	                    # free. exception_on_overflow=False makes PyAudio drop
	                    # overflowed input instead of raising.
	                    audio_data = await loop.run_in_executor(
	                        None,
	                        lambda: stream.read(
	                            CHUNK_SIZE, exception_on_overflow=False
	                        ),
	                    )

	                    # Re-check the flag: the receiver may have finished
	                    # while this coroutine was waiting on the read.
	                    if audio_data and not receiver_finished.is_set():
	                        await ws.send(audio_data)

	            except KeyboardInterrupt:
	                # NOTE(review): under asyncio.run() Ctrl+C usually reaches
	                # tasks as cancellation rather than KeyboardInterrupt, so
	                # this branch may rarely fire - confirm the intended
	                # shutdown path.
	                print("\n🛑 Stopping microphone capture...")
	                await end_session()
	            except Exception as e:
	                print(f"Error capturing microphone: {e}")
	                await end_session()

	        async def receive_transcripts():
	            """Receive and process transcription results with word timestamps"""
	            try:
	                async for result in ws.receive():
	                    if result['type'] == 'transcript':
	                        text = result['text']

	                        if result['is_final']:
	                            # Final result - print it and move to new line
	                            if text.strip():
	                                print(f"\r✅ {text}")

	                            # Handle word-level timestamps if available
	                            if 'words' in result and result['words']:
	                                print("📝 Word timestamps:")
	                                for word_info in result['words']:
	                                    word = word_info['word']
	                                    start = word_info['start']
	                                    end = word_info['end']
	                                    print(f" '{word}': {start:.2f}s - {end:.2f}s")
	                                print()  # Add blank line for readability
	                        elif text.strip():
	                            # Partial result - redraw the current line in place
	                            print(f"\r🔄 {text}", end="", flush=True)

	                    elif result['type'] == 'done':
	                        print("\n🏁 Transcription session ended")
	                        break

	            except Exception as e:
	                print(f"Error receiving transcripts: {e}")
	            finally:
	                # Tell the capture loop to stop on every exit path.
	                receiver_finished.set()

	        # Use asyncio.gather to run microphone capture and transcript receiving concurrently
	        await asyncio.gather(
	            capture_microphone(),
	            receive_transcripts(),
	        )

	    except KeyboardInterrupt:
	        print("\n👋 Goodbye!")
	    except Exception as e:
	        print(f"STT streaming error: {e}")
	    finally:
	        # Clean up resources. Nested try/finally so that a failure in one
	        # step does not skip closing the websocket or the client.
	        try:
	            if stream is not None:
	                stream.stop_stream()
	                stream.close()
	            audio.terminate()
	        finally:
	            try:
	                if ws is not None:
	                    await ws.close()
	            finally:
	                await client.close()

	# Run the example
	if __name__ == "__main__":
	    # Script entry point: build the example coroutine, then hand it to
	    # asyncio.run(); an interrupt that escapes the run ends the program
	    # with a short farewell instead of a traceback.
	    example_coro = streaming_stt_example()
	    try:
	        asyncio.run(example_coro)
	    except KeyboardInterrupt:
	        print("\n👋 Goodbye!")
No results found