Last active
July 24, 2025 13:20
-
-
Save BHznJNs/a81c7047bac79f2b464140863bdfdb0f to your computer and use it in GitHub Desktop.
Runs SenseVoice.cpp in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import subprocess | |
| import re | |
| from pathlib import Path | |
| from modelscope import model_file_download | |
def extract_and_merge_to_one_line_compact(transcript_text):
    """Collapse a timestamped transcript into a single compact line.

    Every ``[<float>-<float>]`` marker together with the one whitespace
    character that follows it is dropped, all newline characters are removed
    (segments are joined without any separator), and leading/trailing
    whitespace is trimmed from the result.

    Args:
        transcript_text: Raw transcript text, e.g. the captured stdout of the
            transcription executable.

    Returns:
        The transcript text as one line without time markers.
    """
    # Matches markers such as "[0.00-1.50] " (both numbers need a decimal part).
    timestamp_marker = re.compile(r'\[\d+\.\d+-\d+\.\d+\]\s')
    without_times = timestamp_marker.sub('', transcript_text)
    # Splitting on newlines and re-joining with nothing removes every newline.
    return ''.join(without_times.split('\n')).strip()
# Download sense-voice-small-fp16.gguf from the lovemefan/SenseVoiceGGUF
# repository into ./sensevoice and keep the path returned by the downloader.
model_path = model_file_download(
    model_id="lovemefan/SenseVoiceGGUF",
    file_path="sense-voice-small-fp16.gguf",
    local_dir="./sensevoice",
)
# Explicit check instead of `assert`: assert statements are removed when
# Python runs with -O, which would let a failed download slip through.
if model_path is None:
    raise RuntimeError("model download failed: model_file_download returned None")

# The loaded audio file should be processed into one channel.
result = subprocess.run(
    args=[
        "./sense-voice-main.exe",
        "-m", Path(model_path).as_posix(),  # forward-slash form of the model path
        "-f", "./Recording.wav",
        "-itn",
    ],
    cwd="./",
    text=True,
    encoding="utf-8",
    capture_output=True,  # stdout carries the transcript, stderr the diagnostics
)

# Exit code 0 means the executable succeeded; otherwise show what it
# wrote to stderr.
if result.returncode == 0:
    print("Transcribe result: ", extract_and_merge_to_one_line_compact(result.stdout))
else:
    print("Transcribe error:", result.stderr)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from typing import Literal | |
class SenseVoiceModel:
    """Wrapper that runs a SenseVoice command-line executable on an audio file."""

    def __init__(self, executable_path: str, model_path: str):
        """Store the locations used by later ``transcribe`` calls.

        Args:
            executable_path: Path of the SenseVoice executable to launch.
            model_path: Model file path handed to the executable via ``-m``.
        """
        self._model_path = model_path
        self._executable_path = executable_path

    @staticmethod
    def remove_timedata(transcript_text: str) -> str:
        """Strip time markers from a transcript and merge it into one line.

        Every ``[<float>-<float>]`` marker plus the single whitespace character
        after it is dropped, all newline characters are removed (segments are
        joined without a separator), and the result is trimmed.

        Args:
            transcript_text: Raw transcript text produced by the executable.

        Returns:
            The transcript as a single line without time markers.
        """
        import re

        # Matches markers such as "[0.00-1.50] " (both numbers need a decimal part).
        timestamp_marker = re.compile(r'\[\d+\.\d+-\d+\.\d+\]\s')
        without_times = timestamp_marker.sub('', transcript_text)
        # Splitting on newlines and re-joining with nothing removes every newline.
        return ''.join(without_times.split('\n')).strip()

    def transcribe(self,
        record_path: str,
        language: Literal["auto", "zh", "en", "yue", "ja", "ko"]="auto"
    ) -> str:
        """Run the executable on ``record_path`` and return the cleaned transcript.

        Args:
            record_path: Audio file path handed to the executable via ``-f``.
            language: Value handed to the executable via ``-l``.

        Returns:
            The captured stdout of the executable after ``remove_timedata``.

        Raises:
            RuntimeError: If the executable exits with a non-zero return code;
                the message is the captured stderr.

        SenseVoice parameter list:
        options:
        -h, --help [default] show this help message and exit
        -t N, --threads N [4 ] number of threads to use during computation
        -p N, --processors N [1 ] number of processors to use during computation
        -ot N, --offset-t N [0 ] time offset in milliseconds
        -on N, --offset-n N [0 ] segment index offset
        -d N, --duration N [0 ] duration of audio to process in milliseconds
        -mc N, --max-context N [-1 ] maximum number of text context tokens to store
        -ml N, --max-len N [0 ] maximum segment length in characters
        -sow, --split-on-word [false ] split on word rather than on token
        -bo N, --best-of N [5 ] number of best candidates to keep
        -bs N, --beam-size N [5 ] beam size for beam search
        -ac N, --audio-ctx N [0 ] audio context size (0 - all)
        -wt N, --word-thold N [0.01 ] word timestamp probability threshold
        -et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
        -lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
        -tp, --temperature N [0.00 ] The sampling temperature, between 0 and 1
        -tpi, --temperature-inc N [0.20 ] The increment of temperature, between 0 and 1
        -debug, --debug-mode [false ] enable debug mode (eg. dump log_mel)
        -di, --diarize [false ] stereo audio diarization
        -tdrz, --tinydiarize [false ] enable tinydiarize (requires a tdrz model)
        -nf, --no-fallback [false ] do not use temperature fallback while decoding
        -otxt, --output-txt [false ] output result in a text file
        -osrt, --output-srt [false ] output result in a srt file
        -ocsv, --output-csv [false ] output result in a CSV file
        -oj, --output-json [false ] output result in a JSON file
        -ojf, --output-json-full [false ] include more information in the JSON file
        -of FNAME, --output-file FNAME [ ] output file path (without file extension)
        -np, --no-prints [false ] do not print anything other than the results
        -ps, --print-special [false ] print special tokens
        -pc, --print-colors [false ] print colors
        -pp, --print-progress [false ] print progress
        -nt, --no-timestamps [false ] do not print timestamps
        -l LANG, --language LANG [auto ] spoken language ('auto' for auto-detect), support [`zh`, `en`, `yue`, `ja`, `ko`]
        --prompt PROMPT [ ] initial prompt (max n_text_ctx/2 tokens)
        -m FNAME, --model FNAME model path
        -f FNAME, --file FNAME [ ] input WAV file path
        --min_speech_duration_ms [250 ] min_speech_duration_ms
        --max_speech_duration_ms [15000 ] log probability threshold for decoder fail
        --min_silence_duration_ms [100 ] min_silence_duration_ms
        --speech_pad_ms [30 ] speech_pad_ms
        -oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
        -ls, --log-score [false ] log best decoder scores of tokens
        -ng, --no-gpu [false ] disable GPU
        -fa, --flash-attn [false ] flash attention
        -itn, --use-itn [true ] use itn
        -prefix, --use-prefix [true ] use itn
        """
        import subprocess

        command = [self._executable_path]
        command += ["-m", self._model_path]
        command += ["-f", record_path]
        command += ["-l", language]
        # NOTE(review): the option list in the docstring shows -debug and -prefix
        # as value-less switches; confirm the executable really accepts the
        # trailing "false" tokens passed here.
        command += ["-debug", "false"]
        command += ["-prefix", "false"]
        command.append("-itn")

        completed = subprocess.run(
            command,
            capture_output=True,
            encoding="utf-8",
            text=True,
        )
        # Surface a failing run to the caller together with its stderr output.
        if completed.returncode != 0:
            raise RuntimeError(completed.stderr)
        return SenseVoiceModel.remove_timedata(completed.stdout)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment