Runs SenseVoice.cpp in Python
import subprocess
import re
from pathlib import Path

from modelscope import model_file_download


def extract_and_merge_to_one_line_compact(transcript_text):
    # remove the timestamp markers, e.g. "[0.00-2.50] "
    no_timestamps = re.sub(r'\[\d+\.\d+-\d+\.\d+\]\s', '', transcript_text)
    # remove newlines, then strip leading and trailing whitespace
    one_line = no_timestamps.replace('\n', '').strip()
    return one_line


model_path = model_file_download(
    model_id="lovemefan/SenseVoiceGGUF",
    file_path="sense-voice-small-fp16.gguf",
    local_dir="./sensevoice",
)
assert model_path is not None  # make sure the model file was actually downloaded

# Note: the input audio file must be mono (a single channel).
result = subprocess.run(
    args=[
        "./sense-voice-main.exe",
        "-m", Path(model_path).as_posix(),
        "-f", "./Recording.wav",
        "-itn",
    ],
    cwd="./",
    text=True,
    encoding="utf-8",
    capture_output=True,
)
if result.returncode == 0:
    print("Transcribe result:", extract_and_merge_to_one_line_compact(result.stdout))
else:
    print("Transcribe error:", result.stderr)
import re
import subprocess
from typing import Literal


class SenseVoiceModel:
    def __init__(self, executable_path: str, model_path: str):
        self._executable_path = executable_path
        self._model_path = model_path

    @staticmethod
    def remove_timedata(transcript_text: str) -> str:
        # remove the timestamp markers, e.g. "[0.00-2.50] "
        no_timestamps = re.sub(r'\[\d+\.\d+-\d+\.\d+\]\s', '', transcript_text)
        # remove newlines, then strip leading and trailing whitespace
        one_line = no_timestamps.replace('\n', '').strip()
        return one_line

    def transcribe(self,
                   record_path: str,
                   language: Literal["auto", "zh", "en", "yue", "ja", "ko"] = "auto",
                   ) -> str:
        """
        SenseVoice parameter list:

        options:
          -h,        --help               [default] show this help message and exit
          -t N,      --threads N          [4      ] number of threads to use during computation
          -p N,      --processors N       [1      ] number of processors to use during computation
          -ot N,     --offset-t N         [0      ] time offset in milliseconds
          -on N,     --offset-n N         [0      ] segment index offset
          -d N,      --duration N         [0      ] duration of audio to process in milliseconds
          -mc N,     --max-context N      [-1     ] maximum number of text context tokens to store
          -ml N,     --max-len N          [0      ] maximum segment length in characters
          -sow,      --split-on-word      [false  ] split on word rather than on token
          -bo N,     --best-of N          [5      ] number of best candidates to keep
          -bs N,     --beam-size N        [5      ] beam size for beam search
          -ac N,     --audio-ctx N        [0      ] audio context size (0 - all)
          -wt N,     --word-thold N       [0.01   ] word timestamp probability threshold
          -et N,     --entropy-thold N    [2.40   ] entropy threshold for decoder fail
          -lpt N,    --logprob-thold N    [-1.00  ] log probability threshold for decoder fail
          -tp,       --temperature N      [0.00   ] the sampling temperature, between 0 and 1
          -tpi,      --temperature-inc N  [0.20   ] the increment of temperature, between 0 and 1
          -debug,    --debug-mode         [false  ] enable debug mode (e.g. dump log_mel)
          -di,       --diarize            [false  ] stereo audio diarization
          -tdrz,     --tinydiarize        [false  ] enable tinydiarize (requires a tdrz model)
          -nf,       --no-fallback        [false  ] do not use temperature fallback while decoding
          -otxt,     --output-txt         [false  ] output result in a text file
          -osrt,     --output-srt         [false  ] output result in a srt file
          -ocsv,     --output-csv         [false  ] output result in a CSV file
          -oj,       --output-json        [false  ] output result in a JSON file
          -ojf,      --output-json-full   [false  ] include more information in the JSON file
          -of FNAME, --output-file FNAME  [       ] output file path (without file extension)
          -np,       --no-prints          [false  ] do not print anything other than the results
          -ps,       --print-special      [false  ] print special tokens
          -pc,       --print-colors      [false  ] print colors
          -pp,       --print-progress     [false  ] print progress
          -nt,       --no-timestamps      [false  ] do not print timestamps
          -l LANG,   --language LANG      [auto   ] spoken language ('auto' for auto-detect), supports [zh, en, yue, ja, ko]
                     --prompt PROMPT      [       ] initial prompt (max n_text_ctx/2 tokens)
          -m FNAME,  --model FNAME                  model path
          -f FNAME,  --file FNAME         [       ] input WAV file path
                     --min_speech_duration_ms  [250  ] min_speech_duration_ms
                     --max_speech_duration_ms  [15000] max_speech_duration_ms
                     --min_silence_duration_ms [100  ] min_silence_duration_ms
                     --speech_pad_ms           [30   ] speech_pad_ms
          -oved D,   --ov-e-device DNAME  [CPU    ] the OpenVINO device used for encode inference
          -ls,       --log-score          [false  ] log best decoder scores of tokens
          -ng,       --no-gpu             [false  ] disable GPU
          -fa,       --flash-attn        [false  ] flash attention
          -itn,      --use-itn            [true   ] use itn
          -prefix,   --use-prefix         [true   ] use prefix
        """
        args = [
            self._executable_path,
            "-m", self._model_path,
            "-f", record_path,
            "-l", language,
            "-debug", "false",
            "-prefix", "false",
            "-itn",
        ]
        result = subprocess.run(
            args=args,
            text=True,
            encoding="utf-8",
            capture_output=True,
        )
        if result.returncode == 0:
            return SenseVoiceModel.remove_timedata(result.stdout)
        else:
            raise RuntimeError(result.stderr)
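
A minimal usage sketch (the paths below are placeholders matching the layout of the first script, not values from the original gist):

    model = SenseVoiceModel(
        executable_path="./sense-voice-main.exe",
        model_path="./sensevoice/sense-voice-small-fp16.gguf",
    )
    print(model.transcribe("./Recording.wav", language="auto"))
    # output such as "[0.00-2.50] hello world\n" comes back as "hello world"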