Created
February 22, 2026 19:14
-
-
Save tomaslucas/93ffc9fdf3f2c8712e4404342d2a859e to your computer and use it in GitHub Desktop.
Voice dictation setup for Ubuntu 24.04 (GNOME Wayland): faster-whisper + ydotool + parecord. Hard-won fixes for ydotoold daemon and wl-copy paste method.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# =============================================================================
# Voice Dictation Setup for Ubuntu 24.04 (Wayland)
# Stack: faster-whisper + ydotool + parecord
# Toggle: Ctrl+Shift+Space (configurable)
#
# Usage: ./install.sh
#   Optional environment overrides: WHISPER_MODEL, HOTKEY
# =============================================================================
set -euo pipefail

# --- Config ------------------------------------------------------------------
WHISPER_MODEL="${WHISPER_MODEL:-small}"   # tiny|base|small|medium|large-v3
HOTKEY="${HOTKEY:-<Ctrl><Shift>space}"    # GNOME keybinding format
DICTATION_DIR="$HOME/faster-whisper-dictation"
VENV_PYTHON="$DICTATION_DIR/venv/bin/python"

# Keep the recording and the PID file in the per-user runtime directory rather
# than under fixed names in world-writable /tmp, where another local user could
# pre-create or symlink them. Falls back to /tmp when XDG_RUNTIME_DIR is unset.
RUNTIME_DIR="${XDG_RUNTIME_DIR:-/tmp}"
AUDIO_FILE="$RUNTIME_DIR/dictation_recording.wav"
PID_FILE="$RUNTIME_DIR/dictation.pid"
# Colors (ANSI escape sequences, stored with a literal backslash and expanded
# by printf's %b at print time).
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; NC='\033[0m'

# Logging helpers. Each joins its arguments with spaces and prints one line.
# Only the color variables go through %b; the message is printed with %s so
# backslashes in it (paths, regexes, ...) are shown literally instead of being
# interpreted as escape sequences, as `echo -e` would do.
#   info    - green  [✓] prefix, stdout
#   warn    - yellow [!] prefix, stdout
#   error   - red    [✗] prefix, stderr
#   section - blank line followed by a yellow "══ title ══" banner, stdout
info()    { printf '%b[✓]%b %s\n' "$GREEN" "$NC" "$*"; }
warn()    { printf '%b[!]%b %s\n' "$YELLOW" "$NC" "$*"; }
error()   { printf '%b[✗]%b %s\n' "$RED" "$NC" "$*" >&2; }
section() { printf '\n%b══ %s ══%b\n' "$YELLOW" "$*" "$NC"; }
# --- Checks ------------------------------------------------------------------
section "Pre-flight checks"

# The script installs per-user files and services, so it must not run as root.
if [[ "$EUID" -eq 0 ]]; then
  error "No ejecutar como root. Usa tu usuario normal."
  exit 1
fi

# Detect display server
SESSION_TYPE="${XDG_SESSION_TYPE:-unknown}"
info "Session type: $SESSION_TYPE"
if [[ "$SESSION_TYPE" == "x11" ]]; then
  warn "Estás en X11. ydotool funciona pero xdotool también sería opción."
fi

# RAM check (small model needs ~2GB).
# Read MemTotal (kB) straight from /proc/meminfo instead of parsing `free`,
# whose row label may be localized. Validate the result so a failed read is
# reported instead of being silently compared as 0.
TOTAL_RAM_MB=$(awk '/^MemTotal:/ {print int($2 / 1024); exit}' /proc/meminfo 2>/dev/null || true)
if [[ "$TOTAL_RAM_MB" =~ ^[0-9]+$ ]]; then
  info "RAM total: ${TOTAL_RAM_MB}MB"
  if (( TOTAL_RAM_MB < 2048 )) && [[ "$WHISPER_MODEL" == "small" ]]; then
    warn "RAM limitada. Considera WHISPER_MODEL=base para este sistema."
  fi
else
  warn "No se pudo determinar la RAM del sistema; se omite la comprobación."
fi
# --- Step 1: System dependencies ---------------------------------------------
section "Step 1: System dependencies"

# Packages are kept in an array so the list is easy to read and extend.
apt_packages=(
  portaudio19-dev
  python3-venv
  python3-pip
  git
  ydotool
  ydotoold
  wl-clipboard
  pulseaudio-utils
  libnotify-bin
  cmake
  libevdev-dev
)

sudo apt-get update -qq
sudo apt-get install -y "${apt_packages[@]}"
info "Dependencias instaladas."
# --- Step 2: ydotool permissions ---------------------------------------------
section "Step 2: ydotool + uinput permissions"

# udev rule: give the 'input' group read/write access to the uinput device.
# The rule file is only written when it does not exist yet.
uinput_rule_file=/etc/udev/rules.d/60-uinput.rules
if [[ -f "$uinput_rule_file" ]]; then
  info "Regla udev ya existe."
else
  printf '%s\n' 'KERNEL=="uinput", MODE="0660", GROUP="input"' \
    | sudo tee "$uinput_rule_file" > /dev/null
  sudo udevadm control --reload-rules
  sudo udevadm trigger
  info "Regla udev creada."
fi
# Add user to input group.
# `id -nG` lists the groups of the *current session*, so a freshly added
# membership is only seen after re-login. The padded pattern match requires an
# exact group name ("input"), unlike `grep -w`, which also matches names such
# as "input-foo". `id -un` is used instead of $USER, which may be unset and
# would abort the script under `set -u`.
if [[ " $(id -nG) " != *" input "* ]]; then
  sudo usermod -aG input "$(id -un)"
  warn "Usuario añadido al grupo 'input'. NECESITAS CERRAR SESIÓN Y VOLVER A ENTRAR antes de continuar."
  warn "Después del re-login, ejecuta este script de nuevo."
  exit 0
else
  info "Usuario ya está en el grupo 'input'."
fi
# ydotoold systemd user service: write the unit, reload, enable and start it.
ydotoold_unit_dir="$HOME/.config/systemd/user"
ydotoold_unit_file="$ydotoold_unit_dir/ydotoold.service"

mkdir -p "$ydotoold_unit_dir"
cat > "$ydotoold_unit_file" << 'EOF'
[Unit]
Description=ydotoold - ydotool daemon
After=graphical-session.target
[Service]
ExecStart=/usr/bin/ydotoold
Restart=on-failure
RestartSec=3
[Install]
WantedBy=default.target
EOF

systemctl --user daemon-reload
# Best effort: a failure to enable/start is reported by the check below
# instead of aborting the installer.
systemctl --user enable --now ydotoold.service 2>/dev/null || true

if ! systemctl --user is-active --quiet ydotoold.service; then
  warn "ydotoold no pudo iniciarse (puede ser normal sin sesión gráfica activa)."
else
  info "ydotoold daemon activo."
fi
# --- Step 3: Python venv + faster-whisper ------------------------------------
section "Step 3: faster-whisper (Python venv)"

venv_dir="$DICTATION_DIR/venv"
venv_pip="$venv_dir/bin/pip"

# Create the install directory and the virtualenv only when missing, so the
# step can be re-run safely.
[[ -d "$DICTATION_DIR" ]] || mkdir -p "$DICTATION_DIR"
if [[ ! -d "$venv_dir" ]]; then
  python3 -m venv "$venv_dir"
  info "Venv creado en $DICTATION_DIR/venv"
fi

"$venv_pip" install --quiet --upgrade pip
"$venv_pip" install --quiet faster-whisper pyaudio sounddevice numpy
info "faster-whisper instalado."
# Pre-download model (avoids the delay on first use).
section "Step 4: Pre-descarga del modelo Whisper ($WHISPER_MODEL)"
info "Descargando modelo '$WHISPER_MODEL'... (puede tardar varios minutos)"

# The model name is passed as an argument to `python -` and read from
# sys.argv[1]. The heredoc delimiter is quoted, so the shell expands nothing
# inside the Python source and the model name is never spliced into code.
# Previously no argument was passed, so the progress line always said 'small'.
"$VENV_PYTHON" - "$WHISPER_MODEL" << 'PYTHON'
import sys
from faster_whisper import WhisperModel

model_name = sys.argv[1]
try:
    print(f" Descargando modelo '{model_name}'...")
    WhisperModel(model_name, device="cpu", compute_type="int8")
    print(" Modelo descargado correctamente.")
except Exception as e:
    print(f" Error: {e}", file=sys.stderr)
    sys.exit(1)
PYTHON
info "Modelo listo."
# --- Step 5: Dictation scripts -----------------------------------------------
section "Step 5: Scripts de dictado"
mkdir -p ~/.local/bin

# dictate-start: starts a background parecord and stores its PID.
# The heredoc delimiter is intentionally unquoted: unescaped $VARS are baked in
# at install time, while \$-escaped ones are evaluated when the script runs.
cat > ~/.local/bin/dictate-start << SCRIPT
#!/usr/bin/env bash
AUDIO_FILE="$AUDIO_FILE"
PID_FILE="$PID_FILE"

if [ -f "\$PID_FILE" ]; then
  # Only treat the PID file as "recording" if that process is still alive;
  # otherwise it is a leftover from a recorder that died, so discard it.
  if kill -0 "\$(cat "\$PID_FILE")" 2>/dev/null; then
    notify-send -i microphone-sensitivity-high "Dictation" "Ya grabando... presiona de nuevo para parar"
    exit 0
  fi
  rm -f "\$PID_FILE"
fi

notify-send -i microphone-sensitivity-high "Dictation" "Grabando... presiona hotkey para parar"

# Record mono 16 kHz WAV (the format best suited to Whisper)
parecord --channels=1 --rate=16000 --format=s16le "\$AUDIO_FILE" &
echo \$! > "\$PID_FILE"
SCRIPT
chmod +x ~/.local/bin/dictate-start
# dictate-stop: stops the recorder, transcribes the audio and pastes the text.
# The outer heredoc delimiter is intentionally unquoted: unescaped $VARS are
# baked in at install time, while \$-escaped ones are evaluated at run time.
# The embedded Python contains no shell variables; it receives the model name
# and the audio path through sys.argv, so the generated WHISPER_MODEL and
# AUDIO_FILE variables are the single place to edit them.
cat > ~/.local/bin/dictate-stop << SCRIPT
#!/usr/bin/env bash
VENV_PYTHON="$VENV_PYTHON"
AUDIO_FILE="$AUDIO_FILE"
PID_FILE="$PID_FILE"
WHISPER_MODEL="$WHISPER_MODEL"

if [ ! -f "\$PID_FILE" ]; then
  notify-send "Dictation" "No está grabando"
  exit 0
fi

# Stop the recording, then wait (up to ~2 s) until the recorder has really
# exited so the audio file is complete before it is transcribed.
REC_PID="\$(cat "\$PID_FILE")"
rm -f "\$PID_FILE"
if [ -n "\$REC_PID" ]; then
  kill "\$REC_PID" 2>/dev/null
  i=0
  while [ "\$i" -lt 20 ] && kill -0 "\$REC_PID" 2>/dev/null; do
    sleep 0.1
    i=\$((i + 1))
  done
fi

notify-send -i emblem-synchronizing "Dictation" "Transcribiendo..."

if ! TEXT=\$("\$VENV_PYTHON" - "\$WHISPER_MODEL" "\$AUDIO_FILE" << 'PYTHON'
import sys
from faster_whisper import WhisperModel

model_name, audio_path = sys.argv[1], sys.argv[2]
model = WhisperModel(model_name, device="cpu", compute_type="int8")
segments, info = model.transcribe(
    audio_path,
    beam_size=5,
    language=None,    # auto-detect; set "es" or "en" to force a language
    vad_filter=True,  # filters out silence automatically
)
text = " ".join(seg.text.strip() for seg in segments)
print(text)
PYTHON
); then
  # A failed transcription is reported as an error, not as "no voice".
  notify-send -i dialog-error "Dictation" "Error al transcribir"
  rm -f "\$AUDIO_FILE"
  exit 1
fi

if [ -n "\$TEXT" ]; then
  # Small delay so focus returns to the target window
  sleep 0.2
  # ydotool type is unreliable on GNOME Wayland regardless of --key-delay value
  # Numeric keycodes (29:1 47:1...) don't work either
  # Correct method: wl-copy to clipboard + ydotool key ctrl+v (symbolic names required)
  printf '%s' "\$TEXT" | wl-copy && sleep 0.1 && ydotool key ctrl+v
  notify-send -i emblem-default "Dictation" "✓ \$(printf '%s\n' "\$TEXT" | cut -c1-60)..."
else
  notify-send -i dialog-warning "Dictation" "Sin voz detectada"
fi
rm -f "\$AUDIO_FILE"
SCRIPT
chmod +x ~/.local/bin/dictate-stop
# dictate-toggle (the script bound to the hotkey): hands over to dictate-stop
# when the PID file exists, otherwise to dictate-start.
toggle_script="$HOME/.local/bin/dictate-toggle"
cat > "$toggle_script" << SCRIPT
#!/usr/bin/env bash
PID_FILE="$PID_FILE"
if [ -f "\$PID_FILE" ]; then
  exec ~/.local/bin/dictate-stop
else
  exec ~/.local/bin/dictate-start
fi
SCRIPT
chmod +x "$toggle_script"
info "Scripts creados en ~/.local/bin/"
# --- Step 6: GNOME keyboard shortcut -----------------------------------------
section "Step 6: Hotkey GNOME ($HOTKEY)"

MEDIA_KEYS_SCHEMA="org.gnome.settings-daemon.plugins.media-keys"
KEYBINDING_SCHEMA="${MEDIA_KEYS_SCHEMA}.custom-keybinding"
KEYBINDING_BASE="/org/gnome/settings-daemon/plugins/media-keys/custom-keybindings"
TOGGLE_CMD="${HOME}/.local/bin/dictate-toggle"

# Read the existing keybindings so they are not overwritten
EXISTING=$(gsettings get "$MEDIA_KEYS_SCHEMA" custom-keybindings 2>/dev/null || echo "@as []")

# Idempotency: if an entry already runs dictate-toggle (e.g. from a previous
# run of this installer), reuse its slot instead of registering a duplicate.
CUSTOM_PATH=""
while IFS= read -r candidate; do
  [[ -n "$candidate" ]] || continue
  candidate_cmd=$(gsettings get "${KEYBINDING_SCHEMA}:${candidate}" command 2>/dev/null < /dev/null || true)
  if [[ "$candidate_cmd" == "'${TOGGLE_CMD}'" ]]; then
    CUSTOM_PATH="$candidate"
    break
  fi
done < <(grep -o "'[^']*'" <<< "$EXISTING" | tr -d "'")

if [[ -z "$CUSTOM_PATH" ]]; then
  # Find the next free index. Matching "/customN/" with both slashes keeps
  # custom1 from being mistaken for custom10.
  NEXT_IDX=0
  while [[ "$EXISTING" == *"/custom${NEXT_IDX}/"* ]]; do
    NEXT_IDX=$((NEXT_IDX + 1))
  done
  CUSTOM_PATH="${KEYBINDING_BASE}/custom${NEXT_IDX}/"

  # Build the new keybinding list
  if [[ "$EXISTING" == "@as []" || "$EXISTING" == "[]" ]]; then
    NEW_LIST="['${CUSTOM_PATH}']"
  else
    # Append to the end of the existing list
    NEW_LIST="${EXISTING%"]"}, '${CUSTOM_PATH}']"
  fi
  gsettings set "$MEDIA_KEYS_SCHEMA" custom-keybindings "$NEW_LIST"
fi

gsettings set "${KEYBINDING_SCHEMA}:${CUSTOM_PATH}" name "Voice Dictation Toggle"
gsettings set "${KEYBINDING_SCHEMA}:${CUSTOM_PATH}" command "$TOGGLE_CMD"
gsettings set "${KEYBINDING_SCHEMA}:${CUSTOM_PATH}" binding "${HOTKEY}"
info "Hotkey registrado: $HOTKEY → dictate-toggle"

# Disable the conflicting IBus shortcut (Ctrl+Space is the most common one).
# Best effort: the IBus schema may not be installed.
if grep -qi "ctrl.*space" <<< "$HOTKEY"; then
  warn "Detectado Ctrl+Space: desactivando shortcut IBus para evitar conflicto..."
  gsettings set org.freedesktop.ibus.general.hotkey triggers "[]" 2>/dev/null || true
fi
# --- Step 7: Smoke test -------------------------------------------------------
section "Step 7: Smoke test"

# Typing an empty string exercises the ydotool client without producing input.
info "Probando ydotool..."
if ! ydotool type -- "" 2>/dev/null; then
  warn "ydotool tuvo un warning (normal si ydotoold no está activo aún)"
else
  info "ydotool OK"
fi

# Only checks that the recorder binary is on PATH; nothing is recorded.
info "Probando parecord..."
if ! command -v parecord > /dev/null 2>&1; then
  error "parecord no encontrado. Instala pulseaudio-utils."
else
  info "parecord disponible"
fi
# --- Summary -----------------------------------------------------------------
# Prints the usage cheat-sheet; $HOTKEY and $WHISPER_MODEL are expanded.
print_summary() {
  cat << EOF
Uso:
$HOTKEY → Empieza a grabar
$HOTKEY → Para y transcribe (texto aparece donde está el cursor)
Scripts:
~/.local/bin/dictate-start # start recording
~/.local/bin/dictate-stop # stop + transcribe + type
~/.local/bin/dictate-toggle # toggle (el que usa el hotkey)
Modelo Whisper: $WHISPER_MODEL (cambia con WHISPER_MODEL=medium ./install.sh)
Idioma: auto-detect (edita dictate-stop para forzar: language="es")
Logs del daemon:
systemctl --user status ydotoold
journalctl --user -u ydotoold -f
Troubleshooting:
- Si el texto no aparece: verifica que estás en el grupo 'input' (groups | grep input)
- Si la primera transcripción es lenta: el modelo se carga en frío; las siguientes son más rápidas
- Para GNOME Wayland con IBus activo: Settings → Keyboard → elimina shortcuts de Ctrl+Space
EOF
}

section "Instalación completa"
print_summary
info "Done. Prueba hablando con $HOTKEY en cualquier ventana."
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment