Created
July 22, 2025 15:57
-
-
Save BHznJNs/eb12f09f668107939e7d28c63bf4c6fa to your computer and use it in GitHub Desktop.
An audio preprocessor module for applications that use Whisper or Faster-Whisper for speech-to-text (STT).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import numpy as np | |
| import noisereduce as nr | |
| from loguru import logger | |
| from pydub import AudioSegment | |
| from pydub.silence import split_on_silence | |
def load_audio_file(audio_path: str) -> AudioSegment | None:
    """Load an audio file and downmix it to a single channel.

    Args:
        audio_path: Path of the audio file to read.

    Returns:
        The mono ``AudioSegment``, or ``None`` when loading or downmixing
        fails (the failure is logged rather than raised).
    """
    try:
        loaded = AudioSegment.from_file(audio_path)
        mono = loaded.set_channels(1)
    except Exception as e:
        logger.error(f"Failed to load the audio file: {e}")
        return None
    return mono
def reduce_noise(audio: AudioSegment) -> AudioSegment:
    """Return a noise-reduced, 16-bit mono copy of ``audio``.

    The samples are handed to ``noisereduce`` as float32 and the result is
    packed back into a 16-bit mono ``AudioSegment`` with the same frame rate.

    Args:
        audio: Segment to clean. It is converted to mono / 16-bit first when
            it is not already in that layout.

    Returns:
        The processed segment. An input without any samples is returned
        as-is (after the layout conversion), since there is nothing to denoise.
    """
    # The output below is always declared as 16-bit mono, so the input has to
    # be in that layout too. Otherwise interleaved stereo samples would be
    # read as one channel, and wider samples would wrap on the int16 cast.
    if audio.channels != 1:
        audio = audio.set_channels(1)
    if audio.sample_width != 2:
        audio = audio.set_sample_width(2)

    samplerate = audio.frame_rate
    # Convert the sample data into the float array that noisereduce can handle.
    samples = np.array(audio.get_array_of_samples()).astype(np.float32)
    if samples.size == 0:
        return audio

    reduced_noise_samples = nr.reduce_noise(y=samples, sr=samplerate, prop_decrease=0.8)

    # Clamp to the int16 range before casting so overshoot saturates instead
    # of wrapping around.
    int16_range = np.iinfo(np.int16)
    pcm = np.clip(reduced_noise_samples, int16_range.min, int16_range.max).astype(np.int16)

    return AudioSegment(
        pcm.tobytes(),
        frame_rate=samplerate,
        sample_width=2,  # 16-bit audio
        channels=1,
    )
def calculate_silence_threshold(audio: AudioSegment) -> float:
    """Derive the silence threshold (in dBFS) used to split ``audio``.

    The candidate threshold sits 8 dB below the segment's average loudness
    (``audio.dBFS``) and is then combined with -50 dBFS via ``min``, so the
    returned value is never higher than -50.

    Args:
        audio: Segment whose ``dBFS`` loudness is read.

    Returns:
        The threshold in dBFS. When the loudness is not a finite number
        (a segment with no signal reports ``-inf``), -50.0 is returned so
        callers can safely convert the result with ``int()``.
    """
    safe_floor_thresh = -50.0
    candidate_thresh = audio.dBFS - 8
    # int(-inf) / int(nan) raise in the caller, so never hand back a
    # non-finite threshold.
    if not np.isfinite(candidate_thresh):
        return safe_floor_thresh
    return min(candidate_thresh, safe_floor_thresh)
def audio_preprocess(audio_path: str) -> bool:
    """Denoise the audio file at ``audio_path`` and drop its silent stretches.

    The file is loaded, noise-reduced, split on silence, and the remaining
    chunks are concatenated and written back to ``audio_path`` in WAV format,
    overwriting the original file.

    Args:
        audio_path: Path of the audio file to process in place.

    Returns:
        ``True`` when the processed audio was written. ``False`` when the
        file could not be loaded, no non-silent chunk was found (the file is
        left untouched), or the export failed.
    """
    audio = load_audio_file(audio_path)
    if audio is None:
        return False

    processed_audio = reduce_noise(audio)
    silence_thresh = calculate_silence_threshold(processed_audio)

    # A segment without any signal can yield a non-finite threshold, and
    # int() of inf/nan raises. Treat that case as "no voice detected".
    chunks = []
    if np.isfinite(silence_thresh):
        chunks = split_on_silence(
            processed_audio,
            min_silence_len=500,
            silence_thresh=int(silence_thresh),
            keep_silence=150,
        )

    if not chunks:
        logger.warning("Failed detect the voice segment. will use the original audio file.")
        return False

    # Merge the split audio chunks back into one segment.
    final_audio = sum(chunks, AudioSegment.empty())

    try:
        final_audio.export(audio_path, format="wav")
    except Exception as e:
        logger.error(f"Failed to export the processed audio data: {e}")
        return False
    return True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment