ilovejs · December 30, 2025 10:35
diff --git a/whisper.ipynb b/whisper.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4",
      "authorship_tag": "ABX9TyNr89E/aBPpbqr5iLfeqRyr",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/ilovejs/c3f8538021c148abfdbe89d435161f51/whisper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Rnfvac1QAXRO",
        "collapsed": true
      },
      "outputs": [],
      "source": [
        "# Install Whisper into the running kernel's environment (%pip targets the kernel).\n",
        "%pip install -q git+https://github.com/openai/whisper.git\n",
        "# The ffmpeg package also provides ffprobe, which a later cell calls.\n",
        "# -y answers the confirmation prompt so the install cannot stall or abort.\n",
        "!sudo apt-get install -y -qq ffmpeg"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from google.colab import files\n",
        "\n",
        "# files.upload() returns a dict keyed by file name. The dict is empty when no\n",
        "# file was picked (e.g. the dialog was cancelled), so fail with a clear message\n",
        "# instead of an IndexError.\n",
        "uploaded = files.upload()\n",
        "if not uploaded:\n",
        "    raise RuntimeError(\"No file was uploaded. Re-run this cell and choose an audio or video file.\")\n",
        "\n",
        "# Only the first uploaded file is used by the rest of the notebook.\n",
        "file_name = next(iter(uploaded))\n",
        "print(f\"Uploaded: {file_name}\")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 90
        },
        "id": "_SX1rRF0pC3u",
        "outputId": "693027c2-07e4-4928-b284-66bcaf5c9bc8"
      },
      "execution_count": 2,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "     <input type=\"file\" id=\"files-9f339a73-878c-4a06-ba3d-3a9abb0b9642\" name=\"files[]\" multiple disabled\n",
              "        style=\"border:none\" />\n",
              "     <output id=\"result-9f339a73-878c-4a06-ba3d-3a9abb0b9642\">\n",
              "      Upload widget is only available when the cell has been executed in the\n",
              "      current browser session. Please rerun this cell to enable.\n",
              "      </output>\n",
              "      <script>// Copyright 2017 Google LLC\n",
              "//\n",
              "// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
              "// you may not use this file except in compliance with the License.\n",
              "// You may obtain a copy of the License at\n",
              "//\n",
              "//      http://www.apache.org/licenses/LICENSE-2.0\n",
              "//\n",
              "// Unless required by applicable law or agreed to in writing, software\n",
              "// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
              "// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
              "// See the License for the specific language governing permissions and\n",
              "// limitations under the License.\n",
              "\n",
              "/**\n",
              " * @fileoverview Helpers for google.colab Python module.\n",
              " */\n",
              "(function(scope) {\n",
              "function span(text, styleAttributes = {}) {\n",
              "  const element = document.createElement('span');\n",
              "  element.textContent = text;\n",
              "  for (const key of Object.keys(styleAttributes)) {\n",
              "    element.style[key] = styleAttributes[key];\n",
              "  }\n",
              "  return element;\n",
              "}\n",
              "\n",
              "// Max number of bytes which will be uploaded at a time.\n",
              "const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
              "\n",
              "function _uploadFiles(inputId, outputId) {\n",
              "  const steps = uploadFilesStep(inputId, outputId);\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  // Cache steps on the outputElement to make it available for the next call\n",
              "  // to uploadFilesContinue from Python.\n",
              "  outputElement.steps = steps;\n",
              "\n",
              "  return _uploadFilesContinue(outputId);\n",
              "}\n",
              "\n",
              "// This is roughly an async generator (not supported in the browser yet),\n",
              "// where there are multiple asynchronous steps and the Python side is going\n",
              "// to poll for completion of each step.\n",
              "// This uses a Promise to block the python side on completion of each step,\n",
              "// then passes the result of the previous step as the input to the next step.\n",
              "function _uploadFilesContinue(outputId) {\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  const steps = outputElement.steps;\n",
              "\n",
              "  const next = steps.next(outputElement.lastPromiseValue);\n",
              "  return Promise.resolve(next.value.promise).then((value) => {\n",
              "    // Cache the last promise value to make it available to the next\n",
              "    // step of the generator.\n",
              "    outputElement.lastPromiseValue = value;\n",
              "    return next.value.response;\n",
              "  });\n",
              "}\n",
              "\n",
              "/**\n",
              " * Generator function which is called between each async step of the upload\n",
              " * process.\n",
              " * @param {string} inputId Element ID of the input file picker element.\n",
              " * @param {string} outputId Element ID of the output display.\n",
              " * @return {!Iterable<!Object>} Iterable of next steps.\n",
              " */\n",
              "function* uploadFilesStep(inputId, outputId) {\n",
              "  const inputElement = document.getElementById(inputId);\n",
              "  inputElement.disabled = false;\n",
              "\n",
              "  const outputElement = document.getElementById(outputId);\n",
              "  outputElement.innerHTML = '';\n",
              "\n",
              "  const pickedPromise = new Promise((resolve) => {\n",
              "    inputElement.addEventListener('change', (e) => {\n",
              "      resolve(e.target.files);\n",
              "    });\n",
              "  });\n",
              "\n",
              "  const cancel = document.createElement('button');\n",
              "  inputElement.parentElement.appendChild(cancel);\n",
              "  cancel.textContent = 'Cancel upload';\n",
              "  const cancelPromise = new Promise((resolve) => {\n",
              "    cancel.onclick = () => {\n",
              "      resolve(null);\n",
              "    };\n",
              "  });\n",
              "\n",
              "  // Wait for the user to pick the files.\n",
              "  const files = yield {\n",
              "    promise: Promise.race([pickedPromise, cancelPromise]),\n",
              "    response: {\n",
              "      action: 'starting',\n",
              "    }\n",
              "  };\n",
              "\n",
              "  cancel.remove();\n",
              "\n",
              "  // Disable the input element since further picks are not allowed.\n",
              "  inputElement.disabled = true;\n",
              "\n",
              "  if (!files) {\n",
              "    return {\n",
              "      response: {\n",
              "        action: 'complete',\n",
              "      }\n",
              "    };\n",
              "  }\n",
              "\n",
              "  for (const file of files) {\n",
              "    const li = document.createElement('li');\n",
              "    li.append(span(file.name, {fontWeight: 'bold'}));\n",
              "    li.append(span(\n",
              "        `(${file.type || 'n/a'}) - ${file.size} bytes, ` +\n",
              "        `last modified: ${\n",
              "            file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
              "                                    'n/a'} - `));\n",
              "    const percent = span('0% done');\n",
              "    li.appendChild(percent);\n",
              "\n",
              "    outputElement.appendChild(li);\n",
              "\n",
              "    const fileDataPromise = new Promise((resolve) => {\n",
              "      const reader = new FileReader();\n",
              "      reader.onload = (e) => {\n",
              "        resolve(e.target.result);\n",
              "      };\n",
              "      reader.readAsArrayBuffer(file);\n",
              "    });\n",
              "    // Wait for the data to be ready.\n",
              "    let fileData = yield {\n",
              "      promise: fileDataPromise,\n",
              "      response: {\n",
              "        action: 'continue',\n",
              "      }\n",
              "    };\n",
              "\n",
              "    // Use a chunked sending to avoid message size limits. See b/62115660.\n",
              "    let position = 0;\n",
              "    do {\n",
              "      const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
              "      const chunk = new Uint8Array(fileData, position, length);\n",
              "      position += length;\n",
              "\n",
              "      const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
              "      yield {\n",
              "        response: {\n",
              "          action: 'append',\n",
              "          file: file.name,\n",
              "          data: base64,\n",
              "        },\n",
              "      };\n",
              "\n",
              "      let percentDone = fileData.byteLength === 0 ?\n",
              "          100 :\n",
              "          Math.round((position / fileData.byteLength) * 100);\n",
              "      percent.textContent = `${percentDone}% done`;\n",
              "\n",
              "    } while (position < fileData.byteLength);\n",
              "  }\n",
              "\n",
              "  // All done.\n",
              "  yield {\n",
              "    response: {\n",
              "      action: 'complete',\n",
              "    }\n",
              "  };\n",
              "}\n",
              "\n",
              "scope.google = scope.google || {};\n",
              "scope.google.colab = scope.google.colab || {};\n",
              "scope.google.colab._files = {\n",
              "  _uploadFiles,\n",
              "  _uploadFilesContinue,\n",
              "};\n",
              "})(self);\n",
              "</script> "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Saving audio_video.mp4 to audio_video.mp4\n",
            "Uploaded: audio_video.mp4\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5a728360"
      },
      "source": [
        "## Modify Transcription for Word Timestamps\n",
        "\n",
        "### Subtask:\n",
        "Adjust the `whisper` transcription call to include `word_timestamps=True`. This will enable access to start and end times for individual words, which is crucial for accurate sentence splitting and timestamp adjustment.\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "e6f246c6",
        "outputId": "bbd3ddbd-cb10-4f0d-a858-17e763449df1"
      },
      "source": [
        "import whisper\n",
        "import math\n",
        "import subprocess # Import subprocess to run shell commands\n",
        "\n",
        "# 1. Load the model (Options: tiny, base, small, medium, large, turbo)\n",
        "print(\"Loading model...\")\n",
        "model = whisper.load_model(\"turbo\")\n",
        "\n",
        "# Ask ffprobe for the media duration so the progress message can show it.\n",
        "# Every failure below is non-fatal: the duration is informational only.\n",
        "duration_seconds = None\n",
        "try:\n",
        "    # Prints only the duration value (in seconds) on stdout\n",
        "    cmd = [\n",
        "        'ffprobe', '-v', 'error',\n",
        "        '-show_entries', 'format=duration',\n",
        "        '-of', 'default=noprint_wrappers=1:nokey=1',\n",
        "        file_name,\n",
        "    ]\n",
        "    result_ffprobe = subprocess.run(cmd, capture_output=True, text=True, check=True)\n",
        "    duration_seconds = float(result_ffprobe.stdout.strip())\n",
        "except FileNotFoundError:\n",
        "    print(\"ffprobe not found. Please ensure ffmpeg is installed and in your PATH.\")\n",
        "except subprocess.CalledProcessError as e:\n",
        "    print(f\"Error running ffprobe: {e}\\n{e.stderr}\")\n",
        "except ValueError:\n",
        "    print(\"Could not parse duration from ffprobe output.\")\n",
        "\n",
        "duration_str = \"\"\n",
        "if duration_seconds:\n",
        "    # Round to tenths of a second BEFORE splitting into h/m/s, so the seconds\n",
        "    # field is never shown as \"60.0s\" (119.96 s becomes \"2m 0.0s\", not \"1m 60.0s\").\n",
        "    total_tenths = int(round(duration_seconds * 10))\n",
        "    hours, remainder_tenths = divmod(total_tenths, 36000)\n",
        "    minutes, seconds_tenths = divmod(remainder_tenths, 600)\n",
        "    seconds = seconds_tenths / 10\n",
        "    if hours > 0:\n",
        "        duration_str = f\" (approx. {hours}h {minutes}m {seconds:.1f}s)\"\n",
        "    elif minutes > 0:\n",
        "        duration_str = f\" (approx. {minutes}m {seconds:.1f}s)\"\n",
        "    else:\n",
        "        duration_str = f\" (approx. {seconds:.1f}s)\"\n",
        "\n",
        "# 2. Transcribe the audio; word_timestamps=True adds per-word start/end times\n",
        "# that the caption-splitting cell relies on.\n",
        "print(f\"Transcribing {file_name}{duration_str}... this may take a moment.\")\n",
        "result = model.transcribe(file_name, word_timestamps=True)\n",
        "\n",
        "# 3. Define function to format timestamps for SBV (H:MM:SS.mmm)\n",
        "def format_sbv_timestamp(seconds):\n",
        "    hours = int(seconds // 3600)\n",
        "    minutes = int((seconds % 3600) // 60)\n",
        "    secs = seconds % 60\n",
        "    # SBV format: 0:00:00.000 (Hours:Minutes:Seconds.Milliseconds)\n",
        "    return f\"{hours}:{minutes:02d}:{secs:06.3f}\"\n"
      ],
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Loading model...\n",
            "Transcribing audio_video.mp4 (approx. 15m 52.0s)... this may take a moment.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "01bdb3d8",
        "outputId": "e27fa291-68ce-41a4-86f2-00b11d7f92bf"
      },
      "source": [
        "import re\n",
        "\n",
        "# Configuration for sentence breaking\n",
        "# Add comma as a delimiter for natural breaks\n",
        "sentence_break_delimiters = \".?!,。？！\"\n",
        "\n",
        "# Generate the regex pattern dynamically from the configured delimiters\n",
        "# re.escape is used to handle any special regex characters if they were in the delimiters string\n",
        "escaped_delimiters = re.escape(sentence_break_delimiters)\n",
        "punctuation_pattern = rf'[{escaped_delimiters}]$'\n",
        "\n",
        "# 1. Initialize an empty list to store the processed sub-segments\n",
        "new_sbv_segments = []\n",
        "\n",
        "# Helper function to process and store a sub-segment\n",
        "def add_sub_segment(words_list):\n",
        "    if not words_list:\n",
        "        return\n",
        "\n",
        "    text = \"\".join([w['word'] for w in words_list]).strip()\n",
        "    start_time = words_list[0]['start']\n",
        "    end_time = words_list[-1]['end']\n",
        "    new_sbv_segments.append({\n",
        "        'start': start_time,\n",
        "        'end': end_time,\n",
        "        'text': text\n",
        "    })\n",
        "\n",
        "# Split every transcription segment into short captions. A caption is closed\n",
        "# when it holds 10 words, when the latest word ends with a delimiter, or when\n",
        "# the segment has no more words. Captions never span two segments.\n",
        "for segment in result['segments']:\n",
        "    words_in_segment = segment.get('words', [])\n",
        "    last_index = len(words_in_segment) - 1\n",
        "    current_sentence_words = []\n",
        "\n",
        "    for i, word_info in enumerate(words_in_segment):\n",
        "        current_sentence_words.append(word_info)\n",
        "\n",
        "        is_last_word_in_segment = i == last_index\n",
        "        ends_with_punctuation = re.search(punctuation_pattern, word_info['word']) is not None\n",
        "\n",
        "        # current_sentence_words always contains the word appended above,\n",
        "        # so no separate emptiness check is needed before flushing.\n",
        "        if len(current_sentence_words) >= 10 or ends_with_punctuation or is_last_word_in_segment:\n",
        "            add_sub_segment(current_sentence_words)\n",
        "            current_sentence_words = []\n",
        "\n",
        "# Render each caption as an SBV block: a \"start,end\" timing line, the caption\n",
        "# text, then a blank separator line.\n",
        "sbv_blocks = []\n",
        "for seg in new_sbv_segments:\n",
        "    start = format_sbv_timestamp(seg['start'])\n",
        "    end = format_sbv_timestamp(seg['end'])\n",
        "    text = seg['text'].strip()\n",
        "    sbv_blocks.append(f\"{start},{end}\\n{text}\\n\\n\")\n",
        "sbv_content_new = \"\".join(sbv_blocks)\n",
        "\n",
        "# Write next to the upload, swapping its extension for .sbv (replaces any\n",
        "# existing file of that name).\n",
        "output_filename = file_name.rsplit('.', maxsplit=1)[0] + \".sbv\"\n",
        "with open(output_filename, \"w\", encoding=\"utf-8\") as f:\n",
        "    f.write(sbv_content_new)\n",
        "\n",
        "print(f\"Success! Created: {output_filename} with broken sentences.\")"
      ],
      "execution_count": 23,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Success! Created: audio_video.sbv with broken sentences.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "960e4ab5",
        "outputId": "d1e93cc1-bacd-486a-9eea-2afa6d901b53"
      },
      "source": [
        "# sbv file\n",
        "files.download(output_filename)"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_728cc862-8c45-4e4e-8612-94c5c5f1a429\", \"audio_video.sbv\", 20764)"
            ]
          },
          "metadata": {}
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Convert the SBV captions (YouTube format) to SRT for local checking,\n",
        "# e.g. in a desktop player such as IINA on macOS.\n",
        "\n",
        "import os\n",
        "import re\n",
        "\n",
        "# Derive both names from the uploaded media file (same rule as the SBV export)\n",
        "# instead of hard-coding \"audio_video.*\", so any upload name works.\n",
        "base_name = file_name.rsplit('.', 1)[0]\n",
        "input_filename = base_name + \".sbv\"\n",
        "output_filename = base_name + \".srt\"\n",
        "\n",
        "# One SBV timestamp: H:MM:SS.mmm\n",
        "SBV_TIMESTAMP = re.compile(r'(\\d+):(\\d{2}):(\\d{2})\\.(\\d{3})')\n",
        "\n",
        "\n",
        "def sbv_timestamp_to_srt(timestamp):\n",
        "    \"\"\"Convert one SBV timestamp (H:MM:SS.mmm) to SRT form (HH:MM:SS,mmm).\n",
        "\n",
        "    SRT uses a comma before the milliseconds and two-digit, zero-padded hours.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If ``timestamp`` is not in H:MM:SS.mmm form.\n",
        "    \"\"\"\n",
        "    match = SBV_TIMESTAMP.fullmatch(timestamp.strip())\n",
        "    if match is None:\n",
        "        raise ValueError(f\"Not an SBV timestamp (expected H:MM:SS.mmm): {timestamp!r}\")\n",
        "    hours, minutes, seconds, millis = match.groups()\n",
        "    return f\"{int(hours):02d}:{minutes}:{seconds},{millis}\"\n",
        "\n",
        "\n",
        "def sbv_to_srt(sbv_text):\n",
        "    \"\"\"Convert SBV caption text into SRT caption text.\n",
        "\n",
        "    Blocks are separated by blank lines; the first line of a block is the\n",
        "    start,end timing line and the remaining lines are the caption text.\n",
        "    Blocks without caption text are dropped, and cues are numbered 1..N in\n",
        "    output order so the numbering has no gaps.\n",
        "\n",
        "    Raises:\n",
        "        ValueError: If a timing line does not hold two SBV timestamps.\n",
        "    \"\"\"\n",
        "    cues = []\n",
        "    # Split on runs of blank lines so an empty caption cannot shift the\n",
        "    # timing line of the following block.\n",
        "    for block in re.split(r'\\n\\s*\\n', sbv_text.strip()):\n",
        "        lines = block.strip().split('\\n')\n",
        "        text = \"\\n\".join(lines[1:]).strip()\n",
        "        if not text:\n",
        "            continue\n",
        "        start, _, end = lines[0].partition(',')\n",
        "        srt_time = f\"{sbv_timestamp_to_srt(start)} --> {sbv_timestamp_to_srt(end)}\"\n",
        "        # SRT cue layout: index, timing line, text, blank line\n",
        "        cues.append(f\"{len(cues) + 1}\\n{srt_time}\\n{text}\\n\\n\")\n",
        "    return \"\".join(cues)\n",
        "\n",
        "\n",
        "# 1. Check if file exists\n",
        "if not os.path.exists(input_filename):\n",
        "    print(f\"Error: '{input_filename}' not found in the current directory.\")\n",
        "else:\n",
        "    with open(input_filename, 'r', encoding='utf-8') as f:\n",
        "        srt_text = sbv_to_srt(f.read())\n",
        "\n",
        "    with open(output_filename, 'w', encoding='utf-8') as f_out:\n",
        "        f_out.write(srt_text)\n",
        "\n",
        "    print(f\"Done! Created: {output_filename}\")\n",
        "\n",
        "    # Optional: Uncomment the line below to automatically download the file to your PC\n",
        "    # from google.colab import files; files.download(output_filename)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ktFjhz9Iqm8G",
        "outputId": "7552bbb0-e55f-483d-8ec2-6102c2a5f0bb"
      },
      "execution_count": 24,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Done! Created: audio_video.srt\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "files.download(output_filename)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 17
        },
        "id": "ZINL0zEKwOZH",
        "outputId": "e8def8c0-cc3b-4060-9da0-48ec60a0af86"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "\n",
              "    async function download(id, filename, size) {\n",
              "      if (!google.colab.kernel.accessAllowed) {\n",
              "        return;\n",
              "      }\n",
              "      const div = document.createElement('div');\n",
              "      const label = document.createElement('label');\n",
              "      label.textContent = `Downloading \"${filename}\": `;\n",
              "      div.appendChild(label);\n",
              "      const progress = document.createElement('progress');\n",
              "      progress.max = size;\n",
              "      div.appendChild(progress);\n",
              "      document.body.appendChild(div);\n",
              "\n",
              "      const buffers = [];\n",
              "      let downloaded = 0;\n",
              "\n",
              "      const channel = await google.colab.kernel.comms.open(id);\n",
              "      // Send a message to notify the kernel that we're ready.\n",
              "      channel.send({})\n",
              "\n",
              "      for await (const message of channel.messages) {\n",
              "        // Send a message to notify the kernel that we're ready.\n",
              "        channel.send({})\n",
              "        if (message.buffers) {\n",
              "          for (const buffer of message.buffers) {\n",
              "            buffers.push(buffer);\n",
              "            downloaded += buffer.byteLength;\n",
              "            progress.value = downloaded;\n",
              "          }\n",
              "        }\n",
              "      }\n",
              "      const blob = new Blob(buffers, {type: 'application/binary'});\n",
              "      const a = document.createElement('a');\n",
              "      a.href = window.URL.createObjectURL(blob);\n",
              "      a.download = filename;\n",
              "      div.appendChild(a);\n",
              "      a.click();\n",
              "      div.remove();\n",
              "    }\n",
              "  "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.Javascript object>"
            ],
            "application/javascript": [
              "download(\"download_959552eb-ddbd-482e-9179-2f80024edfd0\", \"audio_video.srt\", 21685)"
            ]
          },
          "metadata": {}
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"gpuType": "T4",
	"authorship_tag": "ABX9TyNr89E/aBPpbqr5iLfeqRyr",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/ilovejs/c3f8538021c148abfdbe89d435161f51/whisper.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"id": "Rnfvac1QAXRO",
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"!pip install git+https://github.com/openai/whisper.git\n",
	"!sudo apt install ffmpeg"
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from google.colab import files\n",
	"uploaded = files.upload()\n",
	"file_name = list(uploaded.keys())[0]\n",
	"print(f\"Uploaded: {file_name}\")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 90
	},
	"id": "_SX1rRF0pC3u",
	"outputId": "693027c2-07e4-4928-b284-66bcaf5c9bc8"
	},
	"execution_count": 2,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.HTML object>"
	],
	"text/html": [
	"\n",
	" <input type=\"file\" id=\"files-9f339a73-878c-4a06-ba3d-3a9abb0b9642\" name=\"files[]\" multiple disabled\n",
	" style=\"border:none\" />\n",
	" <output id=\"result-9f339a73-878c-4a06-ba3d-3a9abb0b9642\">\n",
	" Upload widget is only available when the cell has been executed in the\n",
	" current browser session. Please rerun this cell to enable.\n",
	" </output>\n",
	" <script>// Copyright 2017 Google LLC\n",
	"//\n",
	"// Licensed under the Apache License, Version 2.0 (the \"License\");\n",
	"// you may not use this file except in compliance with the License.\n",
	"// You may obtain a copy of the License at\n",
	"//\n",
	"// http://www.apache.org/licenses/LICENSE-2.0\n",
	"//\n",
	"// Unless required by applicable law or agreed to in writing, software\n",
	"// distributed under the License is distributed on an \"AS IS\" BASIS,\n",
	"// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
	"// See the License for the specific language governing permissions and\n",
	"// limitations under the License.\n",
	"\n",
	"/**\n",
	" * @fileoverview Helpers for google.colab Python module.\n",
	" */\n",
	"(function(scope) {\n",
	"function span(text, styleAttributes = {}) {\n",
	" const element = document.createElement('span');\n",
	" element.textContent = text;\n",
	" for (const key of Object.keys(styleAttributes)) {\n",
	" element.style[key] = styleAttributes[key];\n",
	" }\n",
	" return element;\n",
	"}\n",
	"\n",
	"// Max number of bytes which will be uploaded at a time.\n",
	"const MAX_PAYLOAD_SIZE = 100 * 1024;\n",
	"\n",
	"function _uploadFiles(inputId, outputId) {\n",
	" const steps = uploadFilesStep(inputId, outputId);\n",
	" const outputElement = document.getElementById(outputId);\n",
	" // Cache steps on the outputElement to make it available for the next call\n",
	" // to uploadFilesContinue from Python.\n",
	" outputElement.steps = steps;\n",
	"\n",
	" return _uploadFilesContinue(outputId);\n",
	"}\n",
	"\n",
	"// This is roughly an async generator (not supported in the browser yet),\n",
	"// where there are multiple asynchronous steps and the Python side is going\n",
	"// to poll for completion of each step.\n",
	"// This uses a Promise to block the python side on completion of each step,\n",
	"// then passes the result of the previous step as the input to the next step.\n",
	"function _uploadFilesContinue(outputId) {\n",
	" const outputElement = document.getElementById(outputId);\n",
	" const steps = outputElement.steps;\n",
	"\n",
	" const next = steps.next(outputElement.lastPromiseValue);\n",
	" return Promise.resolve(next.value.promise).then((value) => {\n",
	" // Cache the last promise value to make it available to the next\n",
	" // step of the generator.\n",
	" outputElement.lastPromiseValue = value;\n",
	" return next.value.response;\n",
	" });\n",
	"}\n",
	"\n",
	"/**\n",
	" * Generator function which is called between each async step of the upload\n",
	" * process.\n",
	" * @param {string} inputId Element ID of the input file picker element.\n",
	" * @param {string} outputId Element ID of the output display.\n",
	" * @return {!Iterable<!Object>} Iterable of next steps.\n",
	" */\n",
	"function* uploadFilesStep(inputId, outputId) {\n",
	" const inputElement = document.getElementById(inputId);\n",
	" inputElement.disabled = false;\n",
	"\n",
	" const outputElement = document.getElementById(outputId);\n",
	" outputElement.innerHTML = '';\n",
	"\n",
	" const pickedPromise = new Promise((resolve) => {\n",
	" inputElement.addEventListener('change', (e) => {\n",
	" resolve(e.target.files);\n",
	" });\n",
	" });\n",
	"\n",
	" const cancel = document.createElement('button');\n",
	" inputElement.parentElement.appendChild(cancel);\n",
	" cancel.textContent = 'Cancel upload';\n",
	" const cancelPromise = new Promise((resolve) => {\n",
	" cancel.onclick = () => {\n",
	" resolve(null);\n",
	" };\n",
	" });\n",
	"\n",
	" // Wait for the user to pick the files.\n",
	" const files = yield {\n",
	" promise: Promise.race([pickedPromise, cancelPromise]),\n",
	" response: {\n",
	" action: 'starting',\n",
	" }\n",
	" };\n",
	"\n",
	" cancel.remove();\n",
	"\n",
	" // Disable the input element since further picks are not allowed.\n",
	" inputElement.disabled = true;\n",
	"\n",
	" if (!files) {\n",
	" return {\n",
	" response: {\n",
	" action: 'complete',\n",
	" }\n",
	" };\n",
	" }\n",
	"\n",
	" for (const file of files) {\n",
	" const li = document.createElement('li');\n",
	" li.append(span(file.name, {fontWeight: 'bold'}));\n",
	" li.append(span(\n",
	" `(${file.type \|\| 'n/a'}) - ${file.size} bytes, ` +\n",
	" `last modified: ${\n",
	" file.lastModifiedDate ? file.lastModifiedDate.toLocaleDateString() :\n",
	" 'n/a'} - `));\n",
	" const percent = span('0% done');\n",
	" li.appendChild(percent);\n",
	"\n",
	" outputElement.appendChild(li);\n",
	"\n",
	" const fileDataPromise = new Promise((resolve) => {\n",
	" const reader = new FileReader();\n",
	" reader.onload = (e) => {\n",
	" resolve(e.target.result);\n",
	" };\n",
	" reader.readAsArrayBuffer(file);\n",
	" });\n",
	" // Wait for the data to be ready.\n",
	" let fileData = yield {\n",
	" promise: fileDataPromise,\n",
	" response: {\n",
	" action: 'continue',\n",
	" }\n",
	" };\n",
	"\n",
	" // Use a chunked sending to avoid message size limits. See b/62115660.\n",
	" let position = 0;\n",
	" do {\n",
	" const length = Math.min(fileData.byteLength - position, MAX_PAYLOAD_SIZE);\n",
	" const chunk = new Uint8Array(fileData, position, length);\n",
	" position += length;\n",
	"\n",
	" const base64 = btoa(String.fromCharCode.apply(null, chunk));\n",
	" yield {\n",
	" response: {\n",
	" action: 'append',\n",
	" file: file.name,\n",
	" data: base64,\n",
	" },\n",
	" };\n",
	"\n",
	" let percentDone = fileData.byteLength === 0 ?\n",
	" 100 :\n",
	" Math.round((position / fileData.byteLength) * 100);\n",
	" percent.textContent = `${percentDone}% done`;\n",
	"\n",
	" } while (position < fileData.byteLength);\n",
	" }\n",
	"\n",
	" // All done.\n",
	" yield {\n",
	" response: {\n",
	" action: 'complete',\n",
	" }\n",
	" };\n",
	"}\n",
	"\n",
	"scope.google = scope.google \|\| {};\n",
	"scope.google.colab = scope.google.colab \|\| {};\n",
	"scope.google.colab._files = {\n",
	" _uploadFiles,\n",
	" _uploadFilesContinue,\n",
	"};\n",
	"})(self);\n",
	"</script> "
	]
	},
	"metadata": {}
	},
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Saving audio_video.mp4 to audio_video.mp4\n",
	"Uploaded: audio_video.mp4\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "5a728360"
	},
	"source": [
	"## Modify Transcription for Word Timestamps\n",
	"\n",
	"### Subtask:\n",
	"Adjust the `whisper` transcription call to include `word_timestamps=True`. This will enable access to start and end times for individual words, which is crucial for accurate sentence splitting and timestamp adjustment.\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "e6f246c6",
	"outputId": "bbd3ddbd-cb10-4f0d-a858-17e763449df1"
	},
	"source": [
	"import whisper\n",
	"import math\n",
	"import subprocess # Import subprocess to run shell commands\n",
	"\n",
	"# 1. Load the model (Options: tiny, base, small, medium, large, turbo)\n",
	"print(\"Loading model...\")\n",
	"model = whisper.load_model(\"turbo\")\n",
	"\n",
	"# Get audio duration using ffprobe\n",
	"duration_seconds = None\n",
	"try:\n",
	" # Command to get duration in seconds\n",
	" cmd = ['ffprobe', '-v', 'error', '-show_entries', 'format=duration', '-of', 'default=noprint_wrappers=1:nokey=1', file_name]\n",
	" result_ffprobe = subprocess.run(cmd, capture_output=True, text=True, check=True)\n",
	" duration_seconds = float(result_ffprobe.stdout.strip())\n",
	"except FileNotFoundError:\n",
	" print(\"ffprobe not found. Please ensure ffmpeg is installed and in your PATH.\")\n",
	"except subprocess.CalledProcessError as e:\n",
	" print(f\"Error running ffprobe: {e}\\n{e.stderr}\")\n",
	"except ValueError:\n",
	" print(\"Could not parse duration from ffprobe output.\")\n",
	"\n",
	"duration_str = \"\"\n",
	"if duration_seconds:\n",
	" hours = int(duration_seconds // 3600)\n",
	" minutes = int((duration_seconds % 3600) // 60)\n",
	" seconds = duration_seconds % 60\n",
	" if hours > 0:\n",
	" duration_str = f\" (approx. {hours}h {minutes}m {seconds:.1f}s)\"\n",
	" elif minutes > 0:\n",
	" duration_str = f\" (approx. {minutes}m {seconds:.1f}s)\"\n",
	" else:\n",
	" duration_str = f\" (approx. {seconds:.1f}s)\"\n",
	"\n",
	"# 2. Transcribe the audio\n",
	"print(f\"Transcribing {file_name}{duration_str}... this may take a moment.\")\n",
	"result = model.transcribe(file_name, word_timestamps=True)\n",
	"\n",
	"# 3. Define function to format timestamps for SBV (H:MM:SS.mmm)\n",
	"def format_sbv_timestamp(seconds):\n",
	" hours = int(seconds // 3600)\n",
	" minutes = int((seconds % 3600) // 60)\n",
	" secs = seconds % 60\n",
	" # SBV format: 0:00:00.000 (Hours:Minutes:Seconds.Milliseconds)\n",
	" return f\"{hours}:{minutes:02d}:{secs:06.3f}\"\n"
	],
	"execution_count": 25,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Loading model...\n",
	"Transcribing audio_video.mp4 (approx. 15m 52.0s)... this may take a moment.\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "01bdb3d8",
	"outputId": "e27fa291-68ce-41a4-86f2-00b11d7f92bf"
	},
	"source": [
	"import re\n",
	"\n",
	"# Configuration for sentence breaking\n",
	"# Add comma as a delimiter for natural breaks\n",
	"sentence_break_delimiters = \".?!,。？！\"\n",
	"\n",
	"# Generate the regex pattern dynamically from the configured delimiters\n",
	"# re.escape is used to handle any special regex characters if they were in the delimiters string\n",
	"escaped_delimiters = re.escape(sentence_break_delimiters)\n",
	"punctuation_pattern = rf'[{escaped_delimiters}]$'\n",
	"\n",
	"# 1. Initialize an empty list to store the processed sub-segments\n",
	"new_sbv_segments = []\n",
	"\n",
	"# Helper function to process and store a sub-segment\n",
	"def add_sub_segment(words_list):\n",
	" if not words_list:\n",
	" return\n",
	"\n",
	" text = \"\".join([w['word'] for w in words_list]).strip()\n",
	" start_time = words_list[0]['start']\n",
	" end_time = words_list[-1]['end']\n",
	" new_sbv_segments.append({\n",
	" 'start': start_time,\n",
	" 'end': end_time,\n",
	" 'text': text\n",
	" })\n",
	"\n",
	"# Iterate through each segment in the result['segments'] list\n",
	"for segment in result['segments']:\n",
	" current_sentence_words = []\n",
	" words_in_segment = segment.get('words', [])\n",
	"\n",
	" # Iterate through each word_info dictionary in the segment['words'] list\n",
	" for i, word_info in enumerate(words_in_segment):\n",
	" # Append the current word_info to current_sentence_words\n",
	" current_sentence_words.append(word_info)\n",
	"\n",
	" is_last_word_in_segment = (i == len(words_in_segment) - 1)\n",
	"\n",
	" # Check for conditions to create a new sub-segment:\n",
	" # a. If the number of words in current_sentence_words reaches 10.\n",
	" # b. If the word_info['word'] ends with a punctuation mark\n",
	" # (e.g., '.', '?', '!', '。', '？', '！', ',') and current_sentence_words is not empty.\n",
	" # c. If it's the last word in the segment['words'] list and current_sentence_words is not empty.\n",
	"\n",
	" ends_with_punctuation = bool(re.search(punctuation_pattern, word_info['word']))\n",
	"\n",
	" if (len(current_sentence_words) >= 10 and current_sentence_words) or \\\n",
	" (ends_with_punctuation and current_sentence_words) or \\\n",
	" (is_last_word_in_segment and current_sentence_words):\n",
	"\n",
	" add_sub_segment(current_sentence_words)\n",
	" current_sentence_words = []\n",
	"\n",
	"# Now, generate the SBV content using the new_sbv_segments\n",
	"sbv_content_new = \"\"\n",
	"for seg in new_sbv_segments:\n",
	" start = format_sbv_timestamp(seg['start'])\n",
	" end = format_sbv_timestamp(seg['end'])\n",
	" text = seg['text'].strip()\n",
	"\n",
	" sbv_content_new += f\"{start},{end}\\n{text}\\n\\n\"\n",
	"\n",
	"# Save to file, overwriting the previous audio_video.sbv\n",
	"output_filename = file_name.rsplit('.', 1)[0] + \".sbv\"\n",
	"with open(output_filename, \"w\", encoding=\"utf-8\") as f:\n",
	" f.write(sbv_content_new)\n",
	"\n",
	"print(f\"Success! Created: {output_filename} with broken sentences.\")"
	],
	"execution_count": 23,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Success! Created: audio_video.sbv with broken sentences.\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"id": "960e4ab5",
	"outputId": "d1e93cc1-bacd-486a-9eea-2afa6d901b53"
	},
	"source": [
	"# sbv file\n",
	"files.download(output_filename)"
	],
	"execution_count": 21,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	],
	"application/javascript": [
	"\n",
	" async function download(id, filename, size) {\n",
	" if (!google.colab.kernel.accessAllowed) {\n",
	" return;\n",
	" }\n",
	" const div = document.createElement('div');\n",
	" const label = document.createElement('label');\n",
	" label.textContent = `Downloading \"${filename}\": `;\n",
	" div.appendChild(label);\n",
	" const progress = document.createElement('progress');\n",
	" progress.max = size;\n",
	" div.appendChild(progress);\n",
	" document.body.appendChild(div);\n",
	"\n",
	" const buffers = [];\n",
	" let downloaded = 0;\n",
	"\n",
	" const channel = await google.colab.kernel.comms.open(id);\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	"\n",
	" for await (const message of channel.messages) {\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	" if (message.buffers) {\n",
	" for (const buffer of message.buffers) {\n",
	" buffers.push(buffer);\n",
	" downloaded += buffer.byteLength;\n",
	" progress.value = downloaded;\n",
	" }\n",
	" }\n",
	" }\n",
	" const blob = new Blob(buffers, {type: 'application/binary'});\n",
	" const a = document.createElement('a');\n",
	" a.href = window.URL.createObjectURL(blob);\n",
	" a.download = filename;\n",
	" div.appendChild(a);\n",
	" a.click();\n",
	" div.remove();\n",
	" }\n",
	" "
	]
	},
	"metadata": {}
	},
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	],
	"application/javascript": [
	"download(\"download_728cc862-8c45-4e4e-8612-94c5c5f1a429\", \"audio_video.sbv\", 20764)"
	]
	},
	"metadata": {}
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"# converting sbv (youtube format) to srt for local checking.\n",
	"# im using mac and iina, so subtitle should be rendered.\n",
	"\n",
	"import os\n",
	"\n",
	"input_filename = \"audio_video.sbv\"\n",
	"output_filename = \"audio_video.srt\"\n",
	"\n",
	"# 1. Check if file exists\n",
	"if not os.path.exists(input_filename):\n",
	" print(f\"Error: '{input_filename}' not found in the current directory.\")\n",
	"else:\n",
	" with open(input_filename, 'r', encoding='utf-8') as f:\n",
	" # Split by empty lines to isolate caption blocks\n",
	" blocks = f.read().strip().split('\\n\\n')\n",
	"\n",
	" with open(output_filename, 'w', encoding='utf-8') as f_out:\n",
	" for index, block in enumerate(blocks, 1):\n",
	" if not block.strip():\n",
	" continue\n",
	"\n",
	" lines = block.split('\\n')\n",
	"\n",
	" # The first line is the timestamp\n",
	" sbv_time = lines[0]\n",
	" # The rest is the text\n",
	" text = \"\\n\".join(lines[1:])\n",
	"\n",
	" # Convert SBV timestamp (0:00.000,0:05.000) to SRT (00:00,000 --> 00:05,000)\n",
	" # 1. Split start and end\n",
	" start, end = sbv_time.split(',')\n",
	" # 2. Replace dots with commas\n",
	" srt_time = f\"{start.replace('.', ',')} --> {end.replace('.', ',')}\"\n",
	"\n",
	" # Write to file (Index, Time, Text, Empty Line)\n",
	" f_out.write(f\"{index}\\n{srt_time}\\n{text}\\n\\n\")\n",
	"\n",
	" print(f\"Done! Created: {output_filename}\")\n",
	"\n",
	" # Optional: Uncomment the line below to automatically download the file to your PC\n",
	" # from google.colab import files; files.download(output_filename)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "ktFjhz9Iqm8G",
	"outputId": "7552bbb0-e55f-483d-8ec2-6102c2a5f0bb"
	},
	"execution_count": 24,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Done! Created: audio_video.srt\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"files.download(output_filename)"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 17
	},
	"id": "ZINL0zEKwOZH",
	"outputId": "e8def8c0-cc3b-4060-9da0-48ec60a0af86"
	},
	"execution_count": 14,
	"outputs": [
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	],
	"application/javascript": [
	"\n",
	" async function download(id, filename, size) {\n",
	" if (!google.colab.kernel.accessAllowed) {\n",
	" return;\n",
	" }\n",
	" const div = document.createElement('div');\n",
	" const label = document.createElement('label');\n",
	" label.textContent = `Downloading \"${filename}\": `;\n",
	" div.appendChild(label);\n",
	" const progress = document.createElement('progress');\n",
	" progress.max = size;\n",
	" div.appendChild(progress);\n",
	" document.body.appendChild(div);\n",
	"\n",
	" const buffers = [];\n",
	" let downloaded = 0;\n",
	"\n",
	" const channel = await google.colab.kernel.comms.open(id);\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	"\n",
	" for await (const message of channel.messages) {\n",
	" // Send a message to notify the kernel that we're ready.\n",
	" channel.send({})\n",
	" if (message.buffers) {\n",
	" for (const buffer of message.buffers) {\n",
	" buffers.push(buffer);\n",
	" downloaded += buffer.byteLength;\n",
	" progress.value = downloaded;\n",
	" }\n",
	" }\n",
	" }\n",
	" const blob = new Blob(buffers, {type: 'application/binary'});\n",
	" const a = document.createElement('a');\n",
	" a.href = window.URL.createObjectURL(blob);\n",
	" a.download = filename;\n",
	" div.appendChild(a);\n",
	" a.click();\n",
	" div.remove();\n",
	" }\n",
	" "
	]
	},
	"metadata": {}
	},
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"<IPython.core.display.Javascript object>"
	],
	"application/javascript": [
	"download(\"download_959552eb-ddbd-482e-9179-2f80024edfd0\", \"audio_video.srt\", 21685)"
	]
	},
	"metadata": {}
	}
	]
	}
	]
	}
No results found