Runs SenseVoice.cpp in Python
import subprocess
import re
from pathlib import Path

from modelscope import model_file_download


def extract_and_merge_to_one_line_compact(transcript_text):
    # remove the timestamp markers, e.g. "[0.00-2.50] "
    no_timestamps = re.sub(r'\[\d+\.\d+-\d+\.\d+\]\s', '', transcript_text)
    # remove newlines, then strip leading and trailing whitespace
    one_line = no_timestamps.replace('\n', '').strip()
    return one_line


model_path = model_file_download(
    model_id="lovemefan/SenseVoiceGGUF",
    file_path="sense-voice-small-fp16.gguf",
    local_dir="./sensevoice",
)
assert model_path is not None  # make sure the model file was actually downloaded

# Note: the input audio file must be mono (a single channel).
result = subprocess.run(
    args=[
        "./sense-voice-main.exe",
        "-m", Path(model_path).as_posix(),
        "-f", "./Recording.wav",
        "-itn",
    ],
    cwd="./",
    text=True,
    encoding="utf-8",
    capture_output=True,
)
if result.returncode == 0:
    print("Transcribe result:", extract_and_merge_to_one_line_compact(result.stdout))
else:
    print("Transcribe error:", result.stderr)
import re
import subprocess
from typing import Literal


class SenseVoiceModel:
    def __init__(self, executable_path: str, model_path: str):
        self._executable_path = executable_path
        self._model_path = model_path

    @staticmethod
    def remove_timedata(transcript_text: str) -> str:
        # remove the timestamp markers, e.g. "[0.00-2.50] "
        no_timestamps = re.sub(r'\[\d+\.\d+-\d+\.\d+\]\s', '', transcript_text)
        # remove newlines, then strip leading and trailing whitespace
        one_line = no_timestamps.replace('\n', '').strip()
        return one_line

    def transcribe(self,
                   record_path: str,
                   language: Literal["auto", "zh", "en", "yue", "ja", "ko"] = "auto",
                   ) -> str:
        """
        SenseVoice parameter list:

        options:
          -h,        --help               [default] show this help message and exit
          -t N,      --threads N          [4      ] number of threads to use during computation
          -p N,      --processors N       [1      ] number of processors to use during computation
          -ot N,     --offset-t N         [0      ] time offset in milliseconds
          -on N,     --offset-n N         [0      ] segment index offset
          -d N,      --duration N         [0      ] duration of audio to process in milliseconds
          -mc N,     --max-context N      [-1     ] maximum number of text context tokens to store
          -ml N,     --max-len N          [0      ] maximum segment length in characters
          -sow,      --split-on-word      [false  ] split on word rather than on token
          -bo N,     --best-of N          [5      ] number of best candidates to keep
          -bs N,     --beam-size N        [5      ] beam size for beam search
          -ac N,     --audio-ctx N        [0      ] audio context size (0 - all)
          -wt N,     --word-thold N       [0.01   ] word timestamp probability threshold
          -et N,     --entropy-thold N    [2.40   ] entropy threshold for decoder fail
          -lpt N,    --logprob-thold N    [-1.00  ] log probability threshold for decoder fail
          -tp,       --temperature N      [0.00   ] the sampling temperature, between 0 and 1
          -tpi,      --temperature-inc N  [0.20   ] the increment of temperature, between 0 and 1
          -debug,    --debug-mode         [false  ] enable debug mode (e.g. dump log_mel)
          -di,       --diarize            [false  ] stereo audio diarization
          -tdrz,     --tinydiarize        [false  ] enable tinydiarize (requires a tdrz model)
          -nf,       --no-fallback        [false  ] do not use temperature fallback while decoding
          -otxt,     --output-txt         [false  ] output result in a text file
          -osrt,     --output-srt         [false  ] output result in a srt file
          -ocsv,     --output-csv         [false  ] output result in a CSV file
          -oj,       --output-json        [false  ] output result in a JSON file
          -ojf,      --output-json-full   [false  ] include more information in the JSON file
          -of FNAME, --output-file FNAME  [       ] output file path (without file extension)
          -np,       --no-prints          [false  ] do not print anything other than the results
          -ps,       --print-special      [false  ] print special tokens
          -pc,       --print-colors      [false  ] print colors
          -pp,       --print-progress     [false  ] print progress
          -nt,       --no-timestamps      [false  ] do not print timestamps
          -l LANG,   --language LANG      [auto   ] spoken language ('auto' for auto-detect), supports [zh, en, yue, ja, ko]
                     --prompt PROMPT      [       ] initial prompt (max n_text_ctx/2 tokens)
          -m FNAME,  --model FNAME                  model path
          -f FNAME,  --file FNAME         [       ] input WAV file path
                     --min_speech_duration_ms  [250  ] min_speech_duration_ms
                     --max_speech_duration_ms  [15000] max_speech_duration_ms
                     --min_silence_duration_ms [100  ] min_silence_duration_ms
                     --speech_pad_ms           [30   ] speech_pad_ms
          -oved D,   --ov-e-device DNAME  [CPU    ] the OpenVINO device used for encode inference
          -ls,       --log-score          [false  ] log best decoder scores of tokens
          -ng,       --no-gpu             [false  ] disable GPU
          -fa,       --flash-attn        [false  ] flash attention
          -itn,      --use-itn            [true   ] use itn
          -prefix,   --use-prefix         [true   ] use prefix
        """
        args = [
            self._executable_path,
            "-m", self._model_path,
            "-f", record_path,
            "-l", language,
            "-debug", "false",
            "-prefix", "false",
            "-itn",
        ]
        result = subprocess.run(
            args=args,
            text=True,
            encoding="utf-8",
            capture_output=True,
        )
        if result.returncode == 0:
            return SenseVoiceModel.remove_timedata(result.stdout)
        else:
            raise RuntimeError(result.stderr)
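
A minimal usage sketch (the paths below are placeholders matching the layout of the first script, not values from the original gist):

    model = SenseVoiceModel(
        executable_path="./sense-voice-main.exe",
        model_path="./sensevoice/sense-voice-small-fp16.gguf",
    )
    print(model.transcribe("./Recording.wav", language="auto"))
    # output such as "[0.00-2.50] hello world\n" comes back as "hello world"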