Skip to content

Instantly share code, notes, and snippets.

@LewisGet
Created December 31, 2025 08:52
Show Gist options
  • Select an option

  • Save LewisGet/3429c0e927679a120eea930c3b243aa8 to your computer and use it in GitHub Desktop.

Select an option

Save LewisGet/3429c0e927679a120eea930c3b243aa8 to your computer and use it in GitHub Desktop.
import glob
import os
from pydub import AudioSegment
# Convert every MP3 under <database_path>/mp3 to a 32 kHz, mono, 16-bit WAV in
# <database_path>/wav, pair each one with its transcript from
# <database_path>/txt, and write a pipe-separated index to
# <database_path>/text.list (one "wav_path|tw_slicer_opt|ZH|transcript" record
# per line).
database_path = "./Taiwan-Tongues-ASR-CE-dataset-zhtw/train"
database_list = []

# Sort the matches so the generated index is reproducible between runs;
# glob itself returns files in arbitrary order.
mp3_paths = sorted(glob.glob(os.path.join(database_path, "mp3", "*.mp3")))

wav_dir = os.path.join(database_path, "wav")
if mp3_paths:
    # The export below writes straight to the target path, so the output
    # directory has to exist first. Only create it when there is work to do,
    # so a mistyped database_path still fails loudly further down.
    os.makedirs(wav_dir, exist_ok=True)

for mp3_path in mp3_paths:
    # splitext only removes the final ".mp3", so names containing extra dots
    # ("a.b.mp3") keep their full stem.
    filename = os.path.splitext(os.path.basename(mp3_path))[0]
    filetext = os.path.join(database_path, "txt", filename + ".txt")
    conver_filename = os.path.join(wav_dir, filename + ".wav")

    # Normalise the audio: 32000 Hz sample rate, 1 channel, 2 bytes/sample.
    sound = AudioSegment.from_mp3(mp3_path)
    sound = sound.set_frame_rate(32000)
    sound = sound.set_channels(1)
    sound = sound.set_sample_width(2)
    sound.export(conver_filename, format="wav")

    # Read the transcript as UTF-8 explicitly (the index is written as UTF-8
    # too) instead of relying on the platform default encoding.
    with open(filetext, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # The index holds one record per line, so line breaks inside the
        # transcript are folded into single spaces and blank lines dropped.
        filecontent = " ".join(part for part in stripped_lines if part)

    # NOTE(review): "tw_slicer_opt" and "ZH" look like fixed speaker and
    # language fields for the downstream trainer - confirm against its docs.
    database_list.append(f"{conver_filename}|tw_slicer_opt|ZH|{filecontent}")

with open(os.path.join(database_path, "text.list"), "w", encoding="utf-8") as f:
    f.write("\n".join(database_list))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment