Skip to content

Instantly share code, notes, and snippets.

@tomaslucas
Created February 22, 2026 19:14
Show Gist options
  • Select an option

  • Save tomaslucas/93ffc9fdf3f2c8712e4404342d2a859e to your computer and use it in GitHub Desktop.

Select an option

Save tomaslucas/93ffc9fdf3f2c8712e4404342d2a859e to your computer and use it in GitHub Desktop.
Voice dictation setup for Ubuntu 24.04 (GNOME Wayland): faster-whisper + ydotool + parecord. Hard-won fixes for ydotoold daemon and wl-copy paste method.
#!/usr/bin/env bash
# =============================================================================
# Voice Dictation Setup for Ubuntu 24.04 (Wayland)
# Stack: faster-whisper + ydotool + parecord
# Toggle: Ctrl+Shift+Space (configurable)
#
# Environment overrides:
#   WHISPER_MODEL  Whisper model to download and use (default: small)
#   HOTKEY         GNOME keybinding string (default: <Ctrl><Shift>space)
# =============================================================================
set -euo pipefail

# --- Config ------------------------------------------------------------------
WHISPER_MODEL="${WHISPER_MODEL:-small}" # tiny|base|small|medium|large-v3
HOTKEY="${HOTKEY:-<Ctrl><Shift>space}"  # GNOME keybinding format
DICTATION_DIR="$HOME/faster-whisper-dictation"
VENV_PYTHON="$DICTATION_DIR/venv/bin/python"

# Runtime state (recording + recorder PID). These paths are baked into the
# generated dictate-* scripts. Prefer the per-user runtime directory over
# fixed names in world-writable /tmp, where another local user could
# pre-create or symlink them; fall back to /tmp when the variable is unset.
RUNTIME_DIR="${XDG_RUNTIME_DIR:-/tmp}"
AUDIO_FILE="$RUNTIME_DIR/dictation_recording.wav"
PID_FILE="$RUNTIME_DIR/dictation.pid"
# Colors (ANSI escape sequences, stored unexpanded; printf's %b expands them)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m'

# Logging helpers. Each takes the message as its arguments (joined by spaces).
# Only the colour codes go through %b; the message itself is printed with %s so
# backslashes in it (e.g. in a path) are shown literally instead of being
# interpreted as escape sequences.
#   info    - green  [✓] prefix, stdout
#   warn    - yellow [!] prefix, stdout
#   error   - red    [✗] prefix, stderr
#   section - blank line followed by a yellow "══ title ══" banner, stdout
info() { printf '%b[✓]%b %s\n' "$GREEN" "$NC" "$*"; }
warn() { printf '%b[!]%b %s\n' "$YELLOW" "$NC" "$*"; }
error() { printf '%b[✗]%b %s\n' "$RED" "$NC" "$*" >&2; }
section() { printf '\n%b══ %s ══%b\n' "$YELLOW" "$*" "$NC"; }
# --- Checks ------------------------------------------------------------------
section "Pre-flight checks"

# Refuse to run as root: later steps write under $HOME, use `systemctl --user`
# and call sudo themselves where elevated rights are needed.
if [[ "$EUID" -eq 0 ]]; then
  error "No ejecutar como root. Usa tu usuario normal."
  exit 1
fi

# Detect display server
SESSION_TYPE="${XDG_SESSION_TYPE:-unknown}"
info "Session type: $SESSION_TYPE"
if [[ "$SESSION_TYPE" == "x11" ]]; then
  warn "Estás en X11. ydotool funciona pero xdotool también sería opción."
fi

# RAM check (small model needs ~2GB)
# LC_ALL=C pins the "Mem:" row label, which free(1) translates in some locales;
# without it the awk match can come back empty and the comparison below would
# treat the empty string as 0.
TOTAL_RAM_MB=$(LC_ALL=C free -m | awk '/^Mem:/{print $2}')
if [[ "$TOTAL_RAM_MB" =~ ^[0-9]+$ ]]; then
  info "RAM disponible: ${TOTAL_RAM_MB}MB"
  if [[ "$TOTAL_RAM_MB" -lt 2048 && "$WHISPER_MODEL" == "small" ]]; then
    warn "RAM limitada. Considera WHISPER_MODEL=base para este sistema."
  fi
else
  warn "No se pudo determinar la RAM total; se omite la comprobación."
fi
# --- Step 1: System dependencies ---------------------------------------------
section "Step 1: System dependencies"

# Packages installed through apt. python3-dev and build-essential are included
# because Step 3 pip-installs pyaudio, which pip builds from source and which
# therefore needs the Python headers and a C compiler in addition to
# portaudio19-dev.
APT_PACKAGES=(
  portaudio19-dev
  python3-venv
  python3-pip
  python3-dev
  build-essential
  git
  ydotool
  ydotoold
  wl-clipboard
  pulseaudio-utils
  libnotify-bin
  cmake
  libevdev-dev
)

sudo apt-get update -qq
sudo apt-get install -y "${APT_PACKAGES[@]}"
info "Dependencias instaladas."
# --- Step 2: ydotool permissions ---------------------------------------------
section "Step 2: ydotool + uinput permissions"

# udev rule: make the uinput device group-accessible (0660) to group "input".
# Written only once; an existing rule file is left untouched.
UINPUT_RULE_FILE=/etc/udev/rules.d/60-uinput.rules
if [[ -f "$UINPUT_RULE_FILE" ]]; then
  info "Regla udev ya existe."
else
  printf '%s\n' 'KERNEL=="uinput", MODE="0660", GROUP="input"' \
    | sudo tee "$UINPUT_RULE_FILE" > /dev/null
  sudo udevadm control --reload-rules
  sudo udevadm trigger
  info "Regla udev creada."
fi

# Group membership: the current session must already be in "input". If it is
# not, add the user and stop here, since the new group only applies after a
# fresh login; the script is meant to be run again afterwards.
if groups | grep -qw input; then
  info "Usuario ya está en el grupo 'input'."
else
  sudo usermod -aG input "$USER"
  warn "Usuario añadido al grupo 'input'. NECESITAS CERRAR SESIÓN Y VOLVER A ENTRAR antes de continuar."
  warn "Después del re-login, ejecuta este script de nuevo."
  exit 0
fi
# ydotoold systemd user service: write the unit file, reload the user manager,
# then enable and start the daemon.
USER_UNIT_DIR="$HOME/.config/systemd/user"
mkdir -p "$USER_UNIT_DIR"
printf '%s\n' \
  '[Unit]' \
  'Description=ydotoold - ydotool daemon' \
  'After=graphical-session.target' \
  '[Service]' \
  'ExecStart=/usr/bin/ydotoold' \
  'Restart=on-failure' \
  'RestartSec=3' \
  '[Install]' \
  'WantedBy=default.target' \
  > "$USER_UNIT_DIR/ydotoold.service"

systemctl --user daemon-reload
# Best effort: a failure to start is reported by the is-active check below
# rather than aborting the installer.
systemctl --user enable --now ydotoold.service 2>/dev/null || true

if ! systemctl --user is-active --quiet ydotoold.service; then
  warn "ydotoold no pudo iniciarse (puede ser normal sin sesión gráfica activa)."
else
  info "ydotoold daemon activo."
fi
# --- Step 3: Python venv + faster-whisper ------------------------------------
section "Step 3: faster-whisper (Python venv)"

VENV_DIR="$DICTATION_DIR/venv"
VENV_PIP="$VENV_DIR/bin/pip"
PIP_PACKAGES=(faster-whisper pyaudio sounddevice numpy)

# Create the install directory, and the virtualenv only if it is not there yet,
# so re-runs reuse the existing environment.
mkdir -p "$DICTATION_DIR"
if [[ ! -d "$VENV_DIR" ]]; then
  python3 -m venv "$VENV_DIR"
  info "Venv creado en $VENV_DIR"
fi

# Upgrade pip first, then install the Python packages into the venv.
"$VENV_PIP" install --quiet --upgrade pip
"$VENV_PIP" install --quiet "${PIP_PACKAGES[@]}"
info "faster-whisper instalado."
# Pre-download the model so the first dictation does not pay the download delay.
section "Step 4: Pre-descarga del modelo Whisper ($WHISPER_MODEL)"
info "Descargando modelo '$WHISPER_MODEL'... (puede tardar varios minutos)"

# The model name is passed as argv[1] to a quoted heredoc instead of being
# spliced into the Python source: the progress message then names the model
# actually requested, and no shell expansion happens inside the Python code.
# A non-zero exit from Python aborts the installer (set -e).
"$VENV_PYTHON" - "$WHISPER_MODEL" << 'PYTHON'
import sys

from faster_whisper import WhisperModel

model_name = sys.argv[1]
try:
    print(f" Descargando modelo '{model_name}'...")
    # Instantiating the model is what triggers the download.
    WhisperModel(model_name, device="cpu", compute_type="int8")
    print(" Modelo descargado correctamente.")
except Exception as e:
    print(f" Error: {e}", file=sys.stderr)
    sys.exit(1)
PYTHON
info "Modelo listo."
# --- Step 5: Dictation scripts -----------------------------------------------
section "Step 5: Scripts de dictado"
mkdir -p ~/.local/bin

# dictate-start: starts a background parecord and stores its PID in PID_FILE.
# The configuration lines are emitted with %q so the installed paths are
# shell-quoted; the rest of the script is a literal (quoted) heredoc, so
# nothing in it is expanded at install time.
{
  printf '#!/usr/bin/env bash\n'
  printf 'AUDIO_FILE=%q\n' "$AUDIO_FILE"
  printf 'PID_FILE=%q\n' "$PID_FILE"
  cat << 'SCRIPT'

if [ -f "$PID_FILE" ]; then
  # Only treat the PID file as "already recording" when that process is still
  # alive; a leftover file from a recorder that died is removed so a new
  # recording can start.
  if kill -0 "$(cat "$PID_FILE")" 2>/dev/null; then
    notify-send -i microphone-sensitivity-high "Dictation" "Ya grabando... presiona de nuevo para parar"
    exit 0
  fi
  rm -f "$PID_FILE"
fi

notify-send -i microphone-sensitivity-high "Dictation" "Grabando... presiona hotkey para parar"

# Record mono 16 kHz 16-bit WAV (a good input format for Whisper)
parecord --channels=1 --rate=16000 --format=s16le "$AUDIO_FILE" &
echo $! > "$PID_FILE"
SCRIPT
} > ~/.local/bin/dictate-start
chmod +x ~/.local/bin/dictate-start
# dictate-stop: stops the recorder, transcribes the recording with
# faster-whisper and pastes the text into the focused window.
# The configuration lines are emitted with %q; everything else is a literal
# (quoted) heredoc, and the model name and audio path reach Python as argv
# instead of being spliced into the Python source.
{
  printf '#!/usr/bin/env bash\n'
  printf 'VENV_PYTHON=%q\n' "$VENV_PYTHON"
  printf 'AUDIO_FILE=%q\n' "$AUDIO_FILE"
  printf 'PID_FILE=%q\n' "$PID_FILE"
  printf 'WHISPER_MODEL=%q\n' "$WHISPER_MODEL"
  cat << 'SCRIPT'

if [ ! -f "$PID_FILE" ]; then
  notify-send "Dictation" "No está grabando"
  exit 0
fi

# Stop the recording, then wait (up to ~2 s) for the recorder to exit so the
# WAV file is complete before it is read. The recorder is not a child of this
# script, so poll with kill -0 instead of using wait.
REC_PID="$(cat "$PID_FILE")"
rm -f "$PID_FILE"
kill "$REC_PID" 2>/dev/null
for _ in {1..20}; do
  kill -0 "$REC_PID" 2>/dev/null || break
  sleep 0.1
done

notify-send -i emblem-synchronizing "Dictation" "Transcribiendo..."

STATUS=0
TEXT=$("$VENV_PYTHON" - "$WHISPER_MODEL" "$AUDIO_FILE" << 'PYTHON'
import sys

from faster_whisper import WhisperModel

model_name, audio_path = sys.argv[1], sys.argv[2]
model = WhisperModel(model_name, device="cpu", compute_type="int8")
segments, info = model.transcribe(
    audio_path,
    beam_size=5,
    language=None,  # auto-detect; set "es" or "en" to force a language
    vad_filter=True,  # filter out silence automatically
)
text = " ".join(seg.text.strip() for seg in segments)
print(text)
PYTHON
) || STATUS=$?

# A transcription failure is reported as such instead of being shown as
# "no speech detected".
if [ "$STATUS" -ne 0 ]; then
  notify-send -i dialog-error "Dictation" "Error al transcribir (código $STATUS)"
  rm -f "$AUDIO_FILE"
  exit 1
fi

if [ -n "$TEXT" ]; then
  # Short delay so focus returns to the target window
  sleep 0.2
  # ydotool type is unreliable on GNOME Wayland regardless of --key-delay value
  # Numeric keycodes (29:1 47:1...) don't work either
  # Correct method: wl-copy to clipboard + ydotool key ctrl+v (symbolic names required)
  printf '%s' "$TEXT" | wl-copy && sleep 0.1 && ydotool key ctrl+v
  notify-send -i emblem-default "Dictation" "✓ $(printf '%s' "$TEXT" | cut -c1-60)..."
else
  notify-send -i dialog-warning "Dictation" "Sin voz detectada"
fi
rm -f "$AUDIO_FILE"
SCRIPT
} > ~/.local/bin/dictate-stop
chmod +x ~/.local/bin/dictate-stop
# dictate-toggle (the script bound to the hotkey): runs dictate-stop when the
# PID file exists and dictate-start otherwise.
# The single-quoted lines are written verbatim into the generated script; only
# the PID_FILE line is expanded at install time.
# shellcheck disable=SC2016
printf '%s\n' \
  '#!/usr/bin/env bash' \
  "PID_FILE=\"$PID_FILE\"" \
  'if [ -f "$PID_FILE" ]; then' \
  'exec ~/.local/bin/dictate-stop' \
  'else' \
  'exec ~/.local/bin/dictate-start' \
  'fi' \
  > ~/.local/bin/dictate-toggle
chmod +x ~/.local/bin/dictate-toggle
info "Scripts creados en ~/.local/bin/"
# --- Step 6: GNOME keyboard shortcut -----------------------------------------
section "Step 6: Hotkey GNOME ($HOTKEY)"

KEYS_SCHEMA="org.gnome.settings-daemon.plugins.media-keys"
BINDING_SCHEMA="${KEYS_SCHEMA}.custom-keybinding"
KEYS_BASE="/org/gnome/settings-daemon/plugins/media-keys/custom-keybindings"
TOGGLE_CMD="${HOME}/.local/bin/dictate-toggle"

# Read the existing custom keybindings so they are not overwritten
EXISTING=$(gsettings get "$KEYS_SCHEMA" custom-keybindings 2>/dev/null || echo "@as []")

# Idempotency: if a previous run already registered dictate-toggle, reuse that
# entry instead of appending a duplicate on every execution.
CUSTOM_PATH=""
while IFS= read -r candidate; do
  current_cmd=$(gsettings get "${BINDING_SCHEMA}:${candidate}" command 2>/dev/null || true)
  if [[ "$current_cmd" == "'${TOGGLE_CMD}'" ]]; then
    CUSTOM_PATH="$candidate"
    break
  fi
done < <(grep -o "${KEYS_BASE}/[^']*/" <<< "$EXISTING" || true)

if [[ -z "$CUSTOM_PATH" ]]; then
  # Find the next free index. The surrounding slashes make the match exact, so
  # "custom1" is not considered taken just because "custom10" exists.
  NEXT_IDX=0
  while [[ "$EXISTING" == *"/custom${NEXT_IDX}/"* ]]; do
    NEXT_IDX=$((NEXT_IDX + 1))
  done
  CUSTOM_PATH="${KEYS_BASE}/custom${NEXT_IDX}/"

  # Build the new keybinding list
  if [[ "$EXISTING" == "@as []" || "$EXISTING" == "[]" ]]; then
    NEW_LIST="['${CUSTOM_PATH}']"
  else
    # Append to the end of the existing list (replace the closing bracket)
    NEW_LIST="${EXISTING%']'}, '${CUSTOM_PATH}']"
  fi
  gsettings set "$KEYS_SCHEMA" custom-keybindings "$NEW_LIST"
fi

# (Re)write the entry itself; on a re-run this also applies a changed HOTKEY.
gsettings set "${BINDING_SCHEMA}:${CUSTOM_PATH}" name "Voice Dictation Toggle"
gsettings set "${BINDING_SCHEMA}:${CUSTOM_PATH}" command "$TOGGLE_CMD"
gsettings set "${BINDING_SCHEMA}:${CUSTOM_PATH}" binding "${HOTKEY}"
info "Hotkey registrado: $HOTKEY → dictate-toggle"

# Avoid an IBus conflict (Ctrl+Space is the most common one). The match is
# case-insensitive and also covers combinations such as <Ctrl><Shift>space.
# Best effort: ignored when the IBus schema is not installed.
if [[ "${HOTKEY,,}" =~ ctrl.*space ]]; then
  warn "Detectado Ctrl+Space: desactivando shortcut IBus para evitar conflicto..."
  gsettings set org.freedesktop.ibus.general.hotkey triggers "[]" 2>/dev/null || true
fi
# --- Step 7: Smoke test -------------------------------------------------------
section "Step 7: Smoke test"

# ydotool: typing an empty string exercises the client without emitting any
# characters; a failure is only reported as a warning.
info "Probando ydotool..."
if ! ydotool type -- "" 2>/dev/null; then
  warn "ydotool tuvo un warning (normal si ydotoold no está activo aún)"
else
  info "ydotool OK"
fi

# parecord: only checks that the command can be found; a missing binary is
# reported but does not abort the script.
info "Probando parecord..."
if ! command -v parecord > /dev/null 2>&1; then
  error "parecord no encontrado. Instala pulseaudio-utils."
else
  info "parecord disponible"
fi
# --- Summary -----------------------------------------------------------------
section "Instalación completa"

# Usage summary, printed one line per array element. Double-quoted entries
# interpolate the configured hotkey / model; single-quoted entries are literal.
SUMMARY_LINES=(
  'Uso:'
  "$HOTKEY → Empieza a grabar"
  "$HOTKEY → Para y transcribe (texto aparece donde está el cursor)"
  'Scripts:'
  '~/.local/bin/dictate-start # start recording'
  '~/.local/bin/dictate-stop # stop + transcribe + type'
  '~/.local/bin/dictate-toggle # toggle (el que usa el hotkey)'
  "Modelo Whisper: $WHISPER_MODEL (cambia con WHISPER_MODEL=medium ./install.sh)"
  'Idioma: auto-detect (edita dictate-stop para forzar: language="es")'
  'Logs del daemon:'
  'systemctl --user status ydotoold'
  'journalctl --user -u ydotoold -f'
  'Troubleshooting:'
  "- Si el texto no aparece: verifica que estás en el grupo 'input' (groups | grep input)"
  '- Si la primera transcripción es lenta: el modelo se carga en frío; las siguientes son más rápidas'
  '- Para GNOME Wayland con IBus activo: Settings → Keyboard → elimina shortcuts de Ctrl+Space'
)
printf '%s\n' "${SUMMARY_LINES[@]}"

info "Done. Prueba hablando con $HOTKEY en cualquier ventana."
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment