Skip to content

Instantly share code, notes, and snippets.

@ilovejs
Created December 30, 2025 10:35
Show Gist options
  • Select an option

  • Save ilovejs/c3f8538021c148abfdbe89d435161f51 to your computer and use it in GitHub Desktop.

Select an option

Save ilovejs/c3f8538021c148abfdbe89d435161f51 to your computer and use it in GitHub Desktop.
whisper.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4",
"authorship_tag": "ABX9TyNr89E/aBPpbqr5iLfeqRyr",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ilovejs/c3f8538021c148abfdbe89d435161f51/whisper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Rnfvac1QAXRO",
"collapsed": true
},
"outputs": [],
"source": [
"!pip install git+https://github.com/openai/whisper.git\n",
"!sudo apt install ffmpeg"
]
},
{
"cell_type": "code",
"source": [
"from google.colab import files\n",
"uploaded = files.upload()\n",
"file_name = list(uploaded.keys())[0]\n",
"print(f\"Uploaded: {file_name}\")"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 90
},
"id": "_SX1rRF0pC3u",
"outputId": "693027c2-07e4-4928-b284-66bcaf5c9bc8"
},
"execution_count": 2,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.HTML object>"
],
"text/html": [
"\n",
" <input type=\"file\" id=\"files-9f339a73-878c-4a06-ba3d-3a9abb0b9642\" name=\"files[]\" multiple disabled\n",
" style=\"border:none\" />\n",
" <output id=\"result-9f339a73-878c-4a06-ba3d-3a9abb0b9642\">\n",
" Upload widget is only available when the cell has been executed in the\n",
" current browser session. Please rerun this cell to enable.\n",
" </output>\n",
" <script>// Copyright 2017 Google LLC\n",
"//\n",
"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"// you may not use this file except in compliance with the License.\n",
"// You may obtain a copy of the License at\n",
"//\n",
"// http://www.apache.org/licenses/LICENSE-2.0\n",
"//\n",
"// Unless required by applicable law or agreed to in writing, software\n",
"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"// See the License for the specific language governing permissions and\n",
"// limitations under the License.\n",
"\n",
"/**\n",
" * @fileoverview Helpers for google.colab Python module.\n",
" */\n",
"(function(scope) {\n",
"function span(text, styleAttributes = {}) {\n",
" const element = document.createElement('span');\n",
" element.textContent = text;\n",
" for (const key of Object.keys(styleAttributes)) {\n",
" element.style[key] = styleAttributes[key];\n",
" }\n",
" return element;\n",
"}\n",
"\n",
"// Max number of bytes which will be uploaded at a time.\n",
"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
"\n",
"function _uploadFiles(inputId, outputId) {\n",
" const steps = uploadFilesStep(inputId, outputId);\n",
" const outputElement = document.getElementById(outputId);\n",
" // Cache steps on the outputElement to make it available for the next call\n",
" // to uploadFilesContinue from Python.\n",
" outputElement.steps = steps;\n",
"\n",
" return _uploadFilesContinue(outputId);\n",
"}\n",
"\n",
"// This is roughly an async generator (not supported in the browser yet),\n",
"// where there are multiple asynchronous steps and the Python side is going\n",
"// to poll for completion of each step.\n",
"// This uses a Promise to block the python side on completion of each step,\n",
"// then passes the result of the previous step as the input to the next step.\n",
"function _uploadFilesContinue(outputId) {\n",
" const outputElement = document.getElementById(outputId);\n",
" const steps = outputElement.steps;\n",
"\n",
" const next = steps.next(outputElement.lastPromiseValue);\n",
" return Promise.resolve(next.value.promise).then((value) => {\n",
" // Cache the last promise value to make it available to the next\n",
" // step of the generator.\n",
" outputElement.lastPromiseValue = value;\n",
" return next.value.response;\n",
" });\n",
"}\n",
"\n",
"/**\n",
" * Generator function which is called between each async step of the upload\n",
" * process.\n",
" * @param {string} inputId Element ID of the input file picker element.\n",
" * @param {string} outputId Element ID of the output display.\n",
" * @return {!Iterable<!Object>} Iterable of next steps.\n",
" */\n",
"function* uploadFilesStep(inputId, outputId) {\n",
" const inputElement = document.getElementById(inputId);\n",
" inputElement.disabled = false;\n",
"\n",
" const outputElement = document.getElementById(outputId);\n",
" outputElement.innerHTML = '';\n",
"\n",
" const pickedPromise = new Promise((resolve) => {\n",
" inputElement.addEventListener('change', (e) => {\n",
" resolve(e.target.files);\n",
" });\n",
" });\n",
"\n",
" const cancel = document.createElement('button');\n",
" inputElement.parentElement.appendChild(cancel);\n",
" cancel.textContent = 'Cancel upload';\n",
" const cancelPromise = new Promise((resolve) => {\n",
" cancel.onclick = () => {\n",
" resolve(null);\n",
" };\n",
" });\n",
"\n",
" // Wait for the user to pick the files.\n",
" const files = yield {\n",
" promise: Promise.race([pickedPromise, cancelPromise]),\n",
" response: {\n",
" action: 'starting',\n",
" }\n",
" };\n",
"\n",
" cancel.remove();\n",
"\n",
" // Disable the input element since further picks are not allowed.\n",
" inputElement.disabled = true;\n",
"\n",
" if (!files) {\n",
" return {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
" }\n",
"\n",
" for (const file of files) {\n",
" const li = document.createElement('li');\n",
" li.append(span(file.name, {fontWeight: 'bold'}));\n",
" li.append(span(\n",
" `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
" `last modified: ${\n",
" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
" 'n/a'} - `));\n",
" const percent = span('0% done');\n",
" li.appendChild(percent);\n",
"\n",
" outputElement.appendChild(li);\n",
"\n",
" const fileDataPromise = new Promise((resolve) => {\n",
" const reader = new FileReader();\n",
" reader.onload = (e) => {\n",
" resolve(e.target.result);\n",
" };\n",
" reader.readAsArrayBuffer(file);\n",
" });\n",
" // Wait for the data to be ready.\n",
" let fileData = yield {\n",
" promise: fileDataPromise,\n",
" response: {\n",
" action: 'continue',\n",
" }\n",
" };\n",
"\n",
" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
" let position = 0;\n",
" do {\n",
" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
" const chunk = new Uint8Array(fileData, position, length);\n",
" position += length;\n",
"\n",
" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
" yield {\n",
" response: {\n",
" action: 'append',\n",
" file: file.name,\n",
" data: base64,\n",
" },\n",
" };\n",
"\n",
" let percentDone = fileData.byteLength === 0 ?\n",
" 100 :\n",
" Math.round((position / fileData.byteLength) * 100);\n",
" percent.textContent = `${percentDone}% done`;\n",
"\n",
" } while (position < fileData.byteLength);\n",
" }\n",
"\n",
" // All done.\n",
" yield {\n",
" response: {\n",
" action: 'complete',\n",
" }\n",
" };\n",
"}\n",
"\n",
"scope.google = scope.google || {};\n",
"scope.google.colab = scope.google.colab || {};\n",
"scope.google.colab._files = {\n",
" _uploadFiles,\n",
" _uploadFilesContinue,\n",
"};\n",
"})(self);\n",
"</script> "
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving audio_video.mp4 to audio_video.mp4\n",
"Uploaded: audio_video.mp4\n"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5a728360"
},
"source": [
"## Modify Transcription for Word Timestamps\n",
"\n",
"### Subtask:\n",
"Adjust the `whisper` transcription call to include `word_timestamps=True`. This will enable access to start and end times for individual words, which is crucial for accurate sentence splitting and timestamp adjustment.\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "e6f246c6",
"outputId": "bbd3ddbd-cb10-4f0d-a858-17e763449df1"
},
"source": [
"import whisper\n",
"import math\n",
"import subprocess # Import subprocess to run shell commands\n",
"\n",
"# 1. Load the model (Options: tiny, base, small, medium, large, turbo)\n",
"print(\"Loading model...\")\n",
"model = whisper.load_model(\"turbo\")\n",
"\n",
"# Get audio duration using ffprobe\n",
"duration_seconds = None\n",
"try:\n",
" # Command to get duration in seconds\n",
" cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_name]\n",
" result_ffprobe = subprocess.run(cmd, capture_output=True, text=True, check=True)\n",
" duration_seconds = float(result_ffprobe.stdout.strip())\n",
"except FileNotFoundError:\n",
" print(\"ffprobe not found. Please ensure ffmpeg is installed and in your PATH.\")\n",
"except subprocess.CalledProcessError as e:\n",
" print(f\"Error running ffprobe: {e}\\n{e.stderr}\")\n",
"except ValueError:\n",
" print(\"Could not parse duration from ffprobe output.\")\n",
"\n",
"duration_str = \"\"\n",
"if duration_seconds:\n",
" hours = int(duration_seconds // 3600)\n",
" minutes = int((duration_seconds % 3600) // 60)\n",
" seconds = duration_seconds % 60\n",
" if hours > 0:\n",
" duration_str = f\" (approx. {hours}h {minutes}m {seconds:.1f}s)\"\n",
" elif minutes > 0:\n",
" duration_str = f\" (approx. {minutes}m {seconds:.1f}s)\"\n",
" else:\n",
" duration_str = f\" (approx. {seconds:.1f}s)\"\n",
"\n",
"# 2. Transcribe the audio\n",
"print(f\"Transcribing {file_name}{duration_str}... this may take a moment.\")\n",
"result = model.transcribe(file_name, word_timestamps=True)\n",
"\n",
"# 3. Define function to format timestamps for SBV (H:MM:SS.mmm)\n",
"def format_sbv_timestamp(seconds):\n",
" hours = int(seconds // 3600)\n",
" minutes = int((seconds % 3600) // 60)\n",
" secs = seconds % 60\n",
" # SBV format: 0:00:00.000 (Hours:Minutes:Seconds.Milliseconds)\n",
" return f\"{hours}:{minutes:02d}:{secs:06.3f}\"\n"
],
"execution_count": 25,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Loading model...\n",
"Transcribing audio_video.mp4 (approx. 15m 52.0s)... this may take a moment.\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "01bdb3d8",
"outputId": "e27fa291-68ce-41a4-86f2-00b11d7f92bf"
},
"source": [
"import re\n",
"\n",
"# Configuration for sentence breaking\n",
"# Add comma as a delimiter for natural breaks\n",
"sentence_break_delimiters = \".?!,。?!\"\n",
"\n",
"# Generate the regex pattern dynamically from the configured delimiters\n",
"# re.escape is used to handle any special regex characters if they were in the delimiters string\n",
"escaped_delimiters = re.escape(sentence_break_delimiters)\n",
"punctuation_pattern = rf'[{escaped_delimiters}]$'\n",
"\n",
"# 1. Initialize an empty list to store the processed sub-segments\n",
"new_sbv_segments = []\n",
"\n",
"# Helper function to process and store a sub-segment\n",
"def add_sub_segment(words_list):\n",
" if not words_list:\n",
" return\n",
"\n",
" text = \"\".join([w['word'] for w in words_list]).strip()\n",
" start_time = words_list[0]['start']\n",
" end_time = words_list[-1]['end']\n",
" new_sbv_segments.append({\n",
" 'start': start_time,\n",
" 'end': end_time,\n",
" 'text': text\n",
" })\n",
"\n",
"# Iterate through each segment in the result['segments'] list\n",
"for segment in result['segments']:\n",
" current_sentence_words = []\n",
" words_in_segment = segment.get('words', [])\n",
"\n",
" # Iterate through each word_info dictionary in the segment['words'] list\n",
" for i, word_info in enumerate(words_in_segment):\n",
" # Append the current word_info to current_sentence_words\n",
" current_sentence_words.append(word_info)\n",
"\n",
" is_last_word_in_segment = (i == len(words_in_segment) - 1)\n",
"\n",
" # Check for conditions to create a new sub-segment:\n",
" # a. If the number of words in current_sentence_words reaches 10.\n",
" # b. If the word_info['word'] ends with a punctuation mark\n",
" # (e.g., '.', '?', '!', '。', '?', '!', ',') and current_sentence_words is not empty.\n",
" # c. If it's the last word in the segment['words'] list and current_sentence_words is not empty.\n",
"\n",
" ends_with_punctuation = bool(re.search(punctuation_pattern, word_info['word']))\n",
"\n",
" if (len(current_sentence_words) >= 10 and current_sentence_words) or \\\n",
" (ends_with_punctuation and current_sentence_words) or \\\n",
" (is_last_word_in_segment and current_sentence_words):\n",
"\n",
" add_sub_segment(current_sentence_words)\n",
" current_sentence_words = []\n",
"\n",
"# Now, generate the SBV content using the new_sbv_segments\n",
"sbv_content_new = \"\"\n",
"for seg in new_sbv_segments:\n",
" start = format_sbv_timestamp(seg['start'])\n",
" end = format_sbv_timestamp(seg['end'])\n",
" text = seg['text'].strip()\n",
"\n",
" sbv_content_new += f\"{start},{end}\\n{text}\\n\\n\"\n",
"\n",
"# Save to file, overwriting the previous audio_video.sbv\n",
"output_filename = file_name.rsplit('.', 1)[0] + \".sbv\"\n",
"with open(output_filename, \"w\", encoding=\"utf-8\") as f:\n",
" f.write(sbv_content_new)\n",
"\n",
"print(f\"Success! Created: {output_filename} with broken sentences.\")"
],
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Success! Created: audio_video.sbv with broken sentences.\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"id": "960e4ab5",
"outputId": "d1e93cc1-bacd-486a-9eea-2afa6d901b53"
},
"source": [
"# sbv file\n",
"files.download(output_filename)"
],
"execution_count": 21,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"\n",
" async function download(id, filename, size) {\n",
" if (!google.colab.kernel.accessAllowed) {\n",
" return;\n",
" }\n",
" const div = document.createElement('div');\n",
" const label = document.createElement('label');\n",
" label.textContent = `Downloading \"${filename}\": `;\n",
" div.appendChild(label);\n",
" const progress = document.createElement('progress');\n",
" progress.max = size;\n",
" div.appendChild(progress);\n",
" document.body.appendChild(div);\n",
"\n",
" const buffers = [];\n",
" let downloaded = 0;\n",
"\n",
" const channel = await google.colab.kernel.comms.open(id);\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
"\n",
" for await (const message of channel.messages) {\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
" if (message.buffers) {\n",
" for (const buffer of message.buffers) {\n",
" buffers.push(buffer);\n",
" downloaded += buffer.byteLength;\n",
" progress.value = downloaded;\n",
" }\n",
" }\n",
" }\n",
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
" const a = document.createElement('a');\n",
" a.href = window.URL.createObjectURL(blob);\n",
" a.download = filename;\n",
" div.appendChild(a);\n",
" a.click();\n",
" div.remove();\n",
" }\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"download(\"download_728cc862-8c45-4e4e-8612-94c5c5f1a429\", \"audio_video.sbv\", 20764)"
]
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": [
"# Convert the SBV file (YouTube caption format) to SRT for local checking.\n",
"# I'm on a Mac using IINA, so the SRT subtitles should render there.\n",
"\n",
"import os\n",
"\n",
"input_filename = \"audio_video.sbv\"\n",
"output_filename = \"audio_video.srt\"\n",
"\n",
"# 1. Check if file exists\n",
"if not os.path.exists(input_filename):\n",
" print(f\"Error: '{input_filename}' not found in the current directory.\")\n",
"else:\n",
" with open(input_filename, 'r', encoding='utf-8') as f:\n",
" # Split by empty lines to isolate caption blocks\n",
" blocks = f.read().strip().split('\\n\\n')\n",
"\n",
" with open(output_filename, 'w', encoding='utf-8') as f_out:\n",
" for index, block in enumerate(blocks, 1):\n",
" if not block.strip():\n",
" continue\n",
"\n",
" lines = block.split('\\n')\n",
"\n",
" # The first line is the timestamp\n",
" sbv_time = lines[0]\n",
" # The rest is the text\n",
" text = \"\\n\".join(lines[1:])\n",
"\n",
" # Convert SBV timestamp (0:00:00.000,0:00:05.000) to SRT style (0:00:00,000 --> 0:00:05,000)\n",
" # 1. Split start and end\n",
" start, end = sbv_time.split(',')\n",
" # 2. Replace dots with commas\n",
" srt_time = f\"{start.replace('.', ',')} --> {end.replace('.', ',')}\"\n",
"\n",
" # Write to file (Index, Time, Text, Empty Line)\n",
" f_out.write(f\"{index}\\n{srt_time}\\n{text}\\n\\n\")\n",
"\n",
" print(f\"Done! Created: {output_filename}\")\n",
"\n",
" # Optional: Uncomment the line below to automatically download the file to your PC\n",
" # from google.colab import files; files.download(output_filename)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ktFjhz9Iqm8G",
"outputId": "7552bbb0-e55f-483d-8ec2-6102c2a5f0bb"
},
"execution_count": 24,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Done! Created: audio_video.srt\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"files.download(output_filename)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 17
},
"id": "ZINL0zEKwOZH",
"outputId": "e8def8c0-cc3b-4060-9da0-48ec60a0af86"
},
"execution_count": 14,
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"\n",
" async function download(id, filename, size) {\n",
" if (!google.colab.kernel.accessAllowed) {\n",
" return;\n",
" }\n",
" const div = document.createElement('div');\n",
" const label = document.createElement('label');\n",
" label.textContent = `Downloading \"${filename}\": `;\n",
" div.appendChild(label);\n",
" const progress = document.createElement('progress');\n",
" progress.max = size;\n",
" div.appendChild(progress);\n",
" document.body.appendChild(div);\n",
"\n",
" const buffers = [];\n",
" let downloaded = 0;\n",
"\n",
" const channel = await google.colab.kernel.comms.open(id);\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
"\n",
" for await (const message of channel.messages) {\n",
" // Send a message to notify the kernel that we're ready.\n",
" channel.send({})\n",
" if (message.buffers) {\n",
" for (const buffer of message.buffers) {\n",
" buffers.push(buffer);\n",
" downloaded += buffer.byteLength;\n",
" progress.value = downloaded;\n",
" }\n",
" }\n",
" }\n",
" const blob = new Blob(buffers, {type: 'application/binary'});\n",
" const a = document.createElement('a');\n",
" a.href = window.URL.createObjectURL(blob);\n",
" a.download = filename;\n",
" div.appendChild(a);\n",
" a.click();\n",
" div.remove();\n",
" }\n",
" "
]
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": [
"<IPython.core.display.Javascript object>"
],
"application/javascript": [
"download(\"download_959552eb-ddbd-482e-9179-2f80024edfd0\", \"audio_video.srt\", 21685)"
]
},
"metadata": {}
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment