shhommychon · September 11, 2025 15:19
diff --git a/-my-speech-upscale-webui.ipynb b/-my-speech-upscale-webui.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "Q-wMgl_X7kOD"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU",
    "gpuClass": "standard"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/a577a97c199d7b756d3bc2d1ac44c1ca/-my-speech-upscale-webui.ipynb)"
      ],
      "metadata": {
        "id": "fQsXgzlF3tZ0"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 음성 업스케일링\n",
        "- 능력자들이 만들어 놓은 음성 업스케일링 SoTA 모델들을 간단한 마우스 몇 번 '딸깍'으로 이용할 수 있도록 만든 페이지"
      ],
      "metadata": {
        "id": "DB4U84-b-6aa"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "##### setup"
      ],
      "metadata": {
        "id": "Q-wMgl_X7kOD"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone https://gist.github.com/a577a97c199d7b756d3bc2d1ac44c1ca.git my-speech-upscale-webui"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "n3fLgpHJ7qj1",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 1318
        },
        "outputId": "13b9a5ce-8c02-41f3-b0a9-500b21795047"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'my-speech-upscale-webui'...\n",
            "remote: Enumerating objects: 22, done.\u001b[K\n",
            "remote: Counting objects: 100% (3/3), done.\u001b[K\n",
            "remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
            "remote: Total 22 (delta 0), reused 2 (delta 0), pack-reused 19 (from 1)\u001b[K\n",
            "Receiving objects: 100% (22/22), 16.72 KiB | 194.00 KiB/s, done.\n",
            "Resolving deltas: 100% (2/2), done.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "###### Nu-Wave 2\n",
        "- Seoul Natl. Univ.\n",
        "- 2022.06\n",
        "- [ArXiv](https://arxiv.org/abs/2206.08545), [GitHub](https://github.com/maum-ai/nuwave2)"
      ],
      "metadata": {
        "id": "ACGu2Edk7r4t"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone --recursive https://github.com/mindslab-ai/nuwave2.git\n",
        "\n",
        "!rm nuwave2/utils/stft.py; mv my-speech-upscale-webui/nuwave2_utils_stft.py nuwave2/utils/stft.py\n",
        "!rm nuwave2/model.py; mv my-speech-upscale-webui/nuwave2_model.py nuwave2/model.py\n",
        "!rm nuwave2/inference.py; mv my-speech-upscale-webui/nuwave2_inference.py nuwave2/inference.py"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "peqjrwYF-52_",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 7554
        },
        "outputId": "32ac41c4-8371-4245-8905-ece74b834a57"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'nuwave2'...\n",
            "remote: Enumerating objects: 239, done.\u001b[K\n",
            "remote: Counting objects: 100% (55/55), done.\u001b[K\n",
            "remote: Compressing objects: 100% (48/48), done.\u001b[K\n",
            "remote: Total 239 (delta 7), reused 7 (delta 7), pack-reused 184 (from 1)\u001b[K\n",
            "Receiving objects: 100% (239/239), 55.91 MiB | 14.08 MiB/s, done.\n",
            "Resolving deltas: 100% (14/14), done.\n",
            "Submodule 'vctk-silence-labels' (https://github.com/nii-yamagishilab/vctk-silence-labels.git) registered for path 'vctk-silence-labels'\n",
            "Cloning into '/content/nuwave2/vctk-silence-labels'...\n",
            "remote: Enumerating objects: 11, done.        \n",
            "remote: Counting objects: 100% (11/11), done.        \n",
            "remote: Compressing objects: 100% (11/11), done.        \n",
            "remote: Total 11 (delta 2), reused 7 (delta 0), pack-reused 0 (from 0)        \n",
            "Receiving objects: 100% (11/11), 242.52 KiB | 20.21 MiB/s, done.\n",
            "Resolving deltas: 100% (2/2), done.\n",
            "Submodule path 'vctk-silence-labels': checked out 'd8910f2eb6750f25a5d8a993ea1f0ef732075021'\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip -qq install pytorch_lightning"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ocDgypRMD-GL",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 5110
        },
        "outputId": "81a9dcc7-a8d7-46cd-b86d-2e4cc5151da4"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/832.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m832.4/832.4 kB\u001b[0m \u001b[31m45.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/983.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m983.2/983.2 kB\u001b[0m \u001b[31m67.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!gdown 11t0cQYx6ZadKQjmfGnqxUUH2UEk5Yzk7 # 8 kHz ~ 48 kHz source → 48 kHz output\n",
        "!gdown 1IZihqb0LKHLtqRjyhHBGxXHJhUwskVRo # 3.2 kHz ~ 16 kHz source → 16 kHz output"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "w5AAGbbC9rdn",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 17663
        },
        "outputId": "a7228ff9-d04d-49e6-fb7b-fdfb0115c2f0"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Downloading...\n",
            "From: https://drive.google.com/uc?id=11t0cQYx6ZadKQjmfGnqxUUH2UEk5Yzk7\n",
            "To: /content/nuwave2_02_16_13_epoch=629.ckpt\n",
            "100% 20.9M/20.9M [00:00<00:00, 40.1MB/s]\n",
            "Downloading...\n",
            "From: https://drive.google.com/uc?id=1IZihqb0LKHLtqRjyhHBGxXHJhUwskVRo\n",
            "To: /content/nuwave2_08_16_08_(3.2k-16k_to_16k)_epoch=584.ckpt\n",
            "100% 20.9M/20.9M [00:00<00:00, 93.2MB/s]\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!export PATH=/content/nuwave2:$PATH"
      ],
      "metadata": {
        "id": "try_oR_aEI1o",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 106
        }
      },
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "###### AudioSR\n",
        "- Univ. of Surrey, Univ. of CA S.D., ByteDance\n",
        "- 2023.09\n",
        "- [ArXiv](https://arxiv.org/abs/2309.07314), [GitHub](https://github.com/haoheliu/versatile_audio_super_resolution)"
      ],
      "metadata": {
        "id": "n3VYQ5kg9W-L"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!git clone https://github.com/haoheliu/versatile_audio_super_resolution.git\n",
        "\n",
        "!rm versatile_audio_super_resolution/inference.py; mv my-speech-upscale-webui/audiosr_inference.py versatile_audio_super_resolution/inference.py"
      ],
      "metadata": {
        "id": "gkX0yUyR9j7U",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 2530
        },
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "outputId": "404330ab-f443-4b98-9b76-73bebde6364e"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Cloning into 'versatile_audio_super_resolution'...\n",
            "remote: Enumerating objects: 475, done.\u001b[K\n",
            "remote: Counting objects: 100% (233/233), done.\u001b[K\n",
            "remote: Compressing objects: 100% (107/107), done.\u001b[K\n",
            "remote: Total 475 (delta 165), reused 139 (delta 126), pack-reused 242 (from 3)\u001b[K\n",
            "Receiving objects: 100% (475/475), 22.78 MiB | 14.60 MiB/s, done.\n",
            "Resolving deltas: 100% (225/225), done.\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip -qq install cog\n",
        "!pip -qq install ftfy\n",
        "!pip -qq install phonemizer\n",
        "!pip -qq install pyloudnorm\n",
        "!pip -qq install torchlibrosa\n",
        "!pip -qq install unidecode"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "gb8vtGGd5hym",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 21010
        },
        "outputId": "46e394c4-6747-451d-f6ea-a2648ddf3ef8"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/78.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.1/78.1 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/60.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.8/60.8 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/95.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.5/95.5 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/67.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.2/67.2 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/510.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.8/510.8 kB\u001b[0m \u001b[31m30.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.7/4.7 MB\u001b[0m \u001b[31m91.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m452.2/452.2 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.8/44.8 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.2/48.2 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.8/103.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.9/59.9 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.4/213.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m565.1/565.1 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from huggingface_hub import hf_hub_download\n",
        "\n",
        "# AudioSR speech 모델 다운로드\n",
        "hf_hub_download(\n",
        "    repo_id=\"haoheliu/audiosr_speech\", # 또는 \"haoheliu/audiosr_basic\"\n",
        "    filename=\"pytorch_model.bin\",\n",
        ")"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 209,
          "referenced_widgets": []
        },
        "id": "eE0F3YAs6P9v",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 86877
        },
        "outputId": "475ca58e-ec91-4f8e-bb26-374ab2538c6d"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
            "The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
            "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
            "You will be able to reuse this secret in all of your notebooks.\n",
            "Please note that authentication is recommended but still optional to access public models or datasets.\n",
            "  warnings.warn(\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "pytorch_model.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "'/root/.cache/huggingface/hub/models--haoheliu--audiosr_speech/snapshots/413f1d734411663e95310c17d381279a0c049960/pytorch_model.bin'"
            ],
            "application/vnd.google.colaboratory.intrinsic+json": {
              "type": "string"
            }
          },
          "metadata": {},
          "execution_count": 0
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!export PATH=/content/versatile_audio_super_resolution:$PATH"
      ],
      "metadata": {
        "id": "gLFE3RWy6P4Y",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 107
        }
      },
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "###### FlowHigh"
      ],
      "metadata": {
        "id": "QvBWOGw49kSW"
      }
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "VxMHYh2y9l2e",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 2
        }
      },
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##### run"
      ],
      "metadata": {
        "id": "F4qIvS54-8iS"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "!python3 nuwave2/inference.py -c /content/nuwave2_02_16_13_epoch=629.ckpt -i sample.wav --sr 15000"
      ],
      "metadata": {
        "id": "3a_oLQi24x2Q",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 37946
        }
      },
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!python3 versatile_audio_super_resolution/inference.py --input sample.wav --output ./audiosr_output"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "mTOT8spL7LGC",
        "executionInfo": {
          "status": "ok",
          "user_tz": -540,
          "elapsed": 157407
        },
        "outputId": "92797cd0-f2ae-4c62-a2ee-fb096fcdbf33"
      },
      "execution_count": 0,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "2025-09-11 14:50:11.971072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
            "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
            "E0000 00:00:1757602212.295152    2230 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
            "E0000 00:00:1757602212.376606    2230 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
            "W0000 00:00:1757602213.034513    2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
            "W0000 00:00:1757602213.034551    2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
            "W0000 00:00:1757602213.034555    2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
            "W0000 00:00:1757602213.034559    2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
            "2025-09-11 14:50:13.092107: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
            "To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
            "tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 191kB/s]\n",
            "vocab.json: 100% 899k/899k [00:00<00:00, 5.26MB/s]\n",
            "merges.txt: 100% 456k/456k [00:00<00:00, 2.88MB/s]\n",
            "tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 111MB/s]\n",
            "config.json: 100% 481/481 [00:00<00:00, 4.99MB/s]\n",
            "/usr/local/lib/python3.12/dist-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers\n",
            "  warnings.warn(f\"Importing from {__name__} is deprecated, please import via timm.layers\", FutureWarning)\n",
            "Loading Model...\n",
            "Loading AudioSR: speech\n",
            "Loading model on cuda:0\n",
            "DiffusionWrapper has 258.20 M params.\n",
            "Model loaded!\n",
            "Setting seed to: 3096934589\n",
            "overlap = 0.04\n",
            "guidance_scale = 3.5\n",
            "ddim_steps = 50\n",
            "chunk_size = 10.24\n",
            "multiband_ensemble = None\n",
            "input file = sample.wav\n",
            "audio.shape = (176640,)\n",
            "input cutoff = 12000\n",
            "Audio is mono\n",
            "enable_overlap = True\n",
            "Processing chunk 1 of 1 for Left/Mono channel\n",
            "Running DDIM Sampling with 50 timesteps\n",
            "DDIM Sampler: 100% 50/50 [00:11<00:00,  4.22it/s]\n",
            "file created: ./audiosr_output/SR_sample.wav\n",
            "\u001b[0m"
          ]
        }
      ]
    }
  ]
 }
diff --git a/.aiexclude b/.aiexclude
 .cursorignore
 LICENSE
 *.ipynb
diff --git a/.cursorignore b/.cursorignore
 .aiexclude
 LICENSE
 *.ipynb
diff --git a/.gitignore b/.gitignore
diff --git a/README.md b/README.md
diff --git a/audiosr_inference.py b/audiosr_inference.py
 import gc
 import os
 import random
 import numpy as np
 from scipy.signal.windows import hann
 import soundfile as sf
 import torch
 from cog import BasePredictor, Input, Path
 import tempfile
 import argparse
 import librosa
 from audiosr import build_model, super_resolution
 from scipy import signal
 import pyloudnorm as pyln


 # Process-wide runtime configuration, applied once at import time.
 # Suppress every Python warning raised from this point on.
 import warnings
 warnings.filterwarnings("ignore")

 # Export TOKENIZERS_PARALLELISM=true into this process's environment
 # (presumably consumed by the Hugging Face tokenizers library - confirm).
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
 # Select PyTorch's "high" float32 matmul precision setting.
 torch.set_float32_matmul_precision("high")

 def match_array_shapes(array_1:np.ndarray, array_2:np.ndarray):
    """Trim or zero-pad ``array_1`` so its sample axis is as long as ``array_2``'s.

    When both arrays are 1-D the sample axis is axis 0; otherwise it is axis 1
    (i.e. ``(channels, samples)`` layout).

    Args:
        array_1: Array to resize. It is never modified in place.
        array_2: Reference array; only its shape is read.

    Returns:
        ``array_1`` itself when the lengths already agree, otherwise a trimmed
        view or a zero-padded copy of it.

    Raises:
        IndexError: If exactly one of the two arrays is 1-D, because axis 1 is
            then looked up on an array that does not have it.
    """
    both_1d = array_1.ndim == 1 and array_2.ndim == 1
    axis = 0 if both_1d else 1
    current, target = array_1.shape[axis], array_2.shape[axis]

    if current > target:
        array_1 = array_1[:target] if both_1d else array_1[:, :target]
    elif current < target:
        missing = target - current
        # Always append the zeros at the END. The 1-D branch used to prepend
        # them, which delayed the signal relative to the reference it is
        # later mixed with and disagreed with the 2-D branch.
        pad_width = (0, missing) if both_1d else ((0, 0), (0, missing))
        array_1 = np.pad(array_1, pad_width, 'constant', constant_values=0)
    return array_1


 def lr_filter(audio, cutoff, filter_type, order=12, sr=48000):
    """Filter ``audio`` with a forward-backward Butterworth filter.

    A Butterworth design of order ``order // 2`` is converted to second-order
    sections and run through ``scipy.signal.sosfiltfilt``. Given the function
    name this is presumably meant as a Linkwitz-Riley crossover leg - confirm.

    Args:
        audio: Samples to filter; transposed before and after filtering, so the
            filter runs along the first axis of a 2-D input.
        cutoff: Cutoff frequency, in the same unit as ``sr``.
        filter_type: Passed to ``scipy.signal.butter`` as ``btype``
            (the file uses ``'lowpass'`` and ``'highpass'``).
        order: Twice the Butterworth order that is actually designed.
        sr: Sample rate used to normalise ``cutoff``.

    Returns:
        The filtered samples, in the same layout as ``audio``.
    """
    # Express the cutoff as a fraction of the Nyquist frequency.
    normalized_cutoff = cutoff / (0.5 * sr)
    numerator, denominator = signal.butter(
        order//2, normalized_cutoff, btype=filter_type, analog=False
    )
    sections = signal.tf2sos(numerator, denominator)
    # sosfiltfilt filters along the last axis, hence the transpose round-trip.
    return signal.sosfiltfilt(sections, audio.T).T

 class Predictor(BasePredictor):
    """Chunk-wise AudioSR upsampler that writes ``SR_<name>.wav`` files.

    Typical use is ``setup()`` once, then ``predict()``. Note that
    ``predict()`` releases the loaded model when it finishes, so ``setup()``
    must be called again before another prediction.

    A few options are not parameters but are looked up as module-level names
    (``input_cutoff``, ``multiband_ensemble``, ``crossover_freq``,
    ``output_folder``); the ``__main__`` section of this file defines them from
    the command line. When a name is missing, the fallbacks documented on each
    method are used instead of failing with ``NameError``.
    """

    @staticmethod
    def _cli_option(name, default=None):
        """Return the module-level variable ``name``, or ``default`` if it is not defined."""
        return globals().get(name, default)

    @staticmethod
    def _measure_loudness(meter, samples):
        """Return ``meter.integrated_loudness(samples)``, or None if the meter raises ValueError.

        NOTE(review): pyloudnorm appears to raise ValueError for clips shorter
        than its analysis block - confirm against the installed version.
        """
        try:
            return meter.integrated_loudness(samples)
        except ValueError:
            return None

    @staticmethod
    def _split_into_chunks(audio, chunk_samples, hop_samples):
        """Cut a 1-D signal into windows of ``chunk_samples`` taken every ``hop_samples``.

        The final window is zero-padded up to ``chunk_samples``.

        Returns:
            ``(chunks, original_lengths)`` where ``original_lengths[i]`` is the
            number of real (unpadded) samples in ``chunks[i]``.

        Raises:
            ValueError: If ``hop_samples`` is not positive, which would
                otherwise make the loop below spin forever.
        """
        if hop_samples <= 0:
            raise ValueError("chunk_size/overlap leave no forward step between chunks; use overlap < 1 and chunk_size > 0")
        chunks = []
        original_lengths = []
        start = 0
        while start < len(audio):
            chunk = audio[start:start + chunk_samples]
            original_lengths.append(len(chunk))
            if len(chunk) < chunk_samples:
                chunk = np.concatenate([chunk, np.zeros(chunk_samples - len(chunk))])
            chunks.append(chunk)
            start += hop_samples
        return chunks, original_lengths

    def setup(self, model_name="basic", device="auto"):
        """Load the AudioSR model and remember the output sample rate.

        Args:
            model_name: Forwarded to ``audiosr.build_model``.
            device: Forwarded to ``audiosr.build_model``.
        """
        self.model_name = model_name
        self.device = device
        self.sr = 48000
        print("Loading Model...")
        self.audiosr = build_model(model_name=self.model_name, device=self.device)
        print("Model loaded!")

    def process_audio(self, input_file, chunk_size=5.12, overlap=0.1, seed=None, guidance_scale=3.5, ddim_steps=50):
        """Upsample ``input_file`` chunk by chunk and stitch the results together.

        The file is loaded at ``2 * input_cutoff`` Hz, split per channel into
        (optionally overlapping) chunks, each chunk is run through
        ``super_resolution``, loudness-matched to its input, cross-faded and
        summed into an output buffer sized for ``self.sr``.

        Module-level options: ``input_cutoff`` (fallback 12000),
        ``multiband_ensemble`` (fallback None, i.e. disabled) and
        ``crossover_freq`` (fallback ``input_cutoff - 1000``).

        Args:
            input_file: Path of the audio file to upsample.
            chunk_size: Chunk duration in seconds.
            overlap: Fraction of a chunk shared with its neighbour; values
                ``<= 0`` disable overlapping and cross-fading.
            seed: Forwarded to ``super_resolution``.
            guidance_scale: Forwarded to ``super_resolution``.
            ddim_steps: Forwarded to ``super_resolution``.

        Returns:
            The reconstructed waveform as a NumPy array (the low/high band sum
            when the multiband ensemble is enabled).

        Raises:
            ValueError: If ``chunk_size``/``overlap`` leave no forward step
                between consecutive chunks.
        """
        input_cutoff = self._cli_option("input_cutoff", 12000)
        multiband_ensemble = self._cli_option("multiband_ensemble")

        sr = input_cutoff * 2
        audio, _ = librosa.load(input_file, sr=sr, mono=False)
        audio = audio.T
        print(f"audio.shape = {audio.shape}")
        print(f"input cutoff = {input_cutoff}")

        is_stereo = len(audio.shape) == 2
        audio_channels = [audio[:, 0], audio[:, 1]] if is_stereo else [audio]
        print("audio is stereo" if is_stereo else "Audio is mono")

        chunk_samples = int(chunk_size * sr)
        overlap_samples = int(overlap * chunk_samples)
        output_chunk_samples = int(chunk_size * self.sr)
        output_overlap_samples = int(overlap * output_chunk_samples)
        enable_overlap = overlap > 0
        print(f"enable_overlap = {enable_overlap}")

        # Distance between consecutive chunk starts, at input and output rate.
        input_hop = chunk_samples - overlap_samples if enable_overlap else chunk_samples
        output_hop = output_chunk_samples - output_overlap_samples if enable_overlap else output_chunk_samples

        # Process both channels (mono or stereo)
        chunks_per_channel = [
            self._split_into_chunks(channel, chunk_samples, input_hop)
            for channel in audio_channels
        ]
        sample_rate_ratio = self.sr / sr
        num_chunks = len(chunks_per_channel[0][0])
        total_length = num_chunks * output_chunk_samples - (num_chunks - 1) * (output_overlap_samples if enable_overlap else 0)
        reconstructed_channels = [np.zeros((1, total_length)) for _ in audio_channels]

        meter_before = pyln.Meter(sr)
        meter_after = pyln.Meter(self.sr)

        # Process chunks for each channel
        for ch_idx, (chunks, original_lengths) in enumerate(chunks_per_channel):
            last_idx = len(chunks) - 1
            for i, chunk in enumerate(chunks):
                loudness_before = self._measure_loudness(meter_before, chunk)
                print(f"Processing chunk {i+1} of {len(chunks)} for {'Left/Mono' if ch_idx == 0 else 'Right'} channel")
                # super_resolution takes a file path, so round-trip the chunk
                # through a temporary WAV that is removed when the block exits.
                with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_wav:
                    sf.write(temp_wav.name, chunk, sr)

                    out_chunk = super_resolution(
                        self.audiosr,
                        temp_wav.name,
                        seed=seed,
                        guidance_scale=guidance_scale,
                        ddim_steps=ddim_steps,
                        latent_t_per_second=12.8
                    )

                out_chunk = out_chunk[0]
                # Drop the part of the output that corresponds to zero padding.
                num_samples_to_keep = int(original_lengths[i] * sample_rate_ratio)
                out_chunk = out_chunk[:, :num_samples_to_keep].squeeze()

                # Match the output loudness to the input chunk. Skip the step
                # when a measurement failed or the output measured as
                # non-finite (e.g. silence), since that would inject NaN/inf.
                loudness_after = self._measure_loudness(meter_after, out_chunk)
                can_match = (
                    loudness_before is not None
                    and loudness_after is not None
                    and np.isfinite(loudness_after)
                    and not np.isnan(loudness_before)
                )
                if can_match:
                    out_chunk = pyln.normalize.loudness(out_chunk, loudness_after, loudness_before)

                if enable_overlap:
                    actual_overlap_samples = min(output_overlap_samples, num_samples_to_keep)
                    # A zero-length fade must be skipped: `x[-0:]` selects the
                    # whole chunk and cannot be multiplied by an empty ramp.
                    if actual_overlap_samples > 0:
                        # Fade in only when a previous chunk exists and fade
                        # out only when a next chunk exists, so a single-chunk
                        # file no longer has its tail faded to silence.
                        if i > 0:
                            out_chunk[:actual_overlap_samples] *= np.linspace(0., 1., actual_overlap_samples)
                        if i < last_idx:
                            out_chunk[-actual_overlap_samples:] *= np.linspace(1., 0., actual_overlap_samples)

                start = i * output_hop
                end = start + out_chunk.shape[0]
                reconstructed_channels[ch_idx][0, start:end] += out_chunk.flatten()

        reconstructed_audio = np.stack(reconstructed_channels, axis=-1) if is_stereo else reconstructed_channels[0]

        if multiband_ensemble:
            crossover_freq = self._cli_option("crossover_freq", input_cutoff - 1000)
            # Keep the original signal below the crossover and the generated
            # signal above it, then sum the two bands.
            low, _ = librosa.load(input_file, sr=self.sr, mono=False)
            output = match_array_shapes(reconstructed_audio[0].T, low)
            low = lr_filter(low.T, crossover_freq, 'lowpass', order=10)
            high = lr_filter(output.T, crossover_freq, 'highpass', order=10)
            high = lr_filter(high, 23000, 'lowpass', order=2)
            output = low + high
        else:
            output = reconstructed_audio[0]
        return output


    def predict(self,
        input_file: Path = Input(description="Audio to upsample"),
        ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
        guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
        overlap: float = Input(description="overlap size", default=0.04),
        chunk_size: float = Input(description="chunksize", default=10.24),
        seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None)
    ) -> Path:
        """Upsample ``input_file`` and write ``<output_folder>/SR_<name>.wav``.

        Module-level options: ``output_folder`` (fallback: the current
        directory) and ``multiband_ensemble`` (only echoed here).

        Args:
            input_file: Path of the audio file to upsample.
            ddim_steps: Forwarded to ``process_audio``.
            guidance_scale: Forwarded to ``process_audio``.
            overlap: Forwarded to ``process_audio``.
            chunk_size: Forwarded to ``process_audio``.
            seed: Random seed; ``None`` or ``0`` picks a random 32-bit seed.

        Returns:
            Path of the written 16-bit PCM WAV file.

        Side effects:
            Creates ``output_folder`` if needed, deletes ``self.audiosr`` and
            empties the CUDA cache, so ``setup()`` must run again before the
            next call.
        """
        output_folder = self._cli_option("output_folder") or "."
        multiband_ensemble = self._cli_option("multiband_ensemble")

        # The parameter description promises a random seed when left blank.
        if seed is None or seed == 0:
            seed = random.randint(0, 2**32 - 1)
        print(f"Setting seed to: {seed}")
        print(f"overlap = {overlap}")
        print(f"guidance_scale = {guidance_scale}")
        print(f"ddim_steps = {ddim_steps}")
        print(f"chunk_size = {chunk_size}")
        print(f"multiband_ensemble = {multiband_ensemble}")
        print(f"input file = {os.path.basename(input_file)}")
        os.makedirs(output_folder, exist_ok=True)
        waveform = self.process_audio(
            input_file,
            chunk_size=chunk_size,
            overlap=overlap,
            seed=seed,
            guidance_scale=guidance_scale,
            ddim_steps=ddim_steps
        )

        filename = os.path.splitext(os.path.basename(input_file))[0]
        output_path = f"{output_folder}/SR_{filename}.wav"
        sf.write(output_path, data=waveform, samplerate=self.sr,  subtype="PCM_16")
        print(f"file created: {output_path}")
        # Free the model and the result buffer before returning.
        del self.audiosr, waveform
        gc.collect()
        torch.cuda.empty_cache()
        return Path(output_path)


 if __name__ == "__main__":
    # Command-line entry point: parse the options, publish the ones that
    # Predictor reads as module-level names, run one prediction, free the model.

    def _parse_bool(text):
        """Turn a command-line word such as "true"/"false" into a bool.

        ``type=bool`` cannot be used for this: ``bool("False")`` is True, so
        every non-empty value used to enable the option.
        """
        lowered = str(text).strip().lower()
        if lowered in ("1", "true", "t", "yes", "y", "on"):
            return True
        if lowered in ("", "0", "false", "f", "no", "n", "off"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {text!r}")

    parser = argparse.ArgumentParser(description="Upsample an audio file to 48 kHz with AudioSR.")
    # Both paths are mandatory; without them the run used to fail later with
    # an unrelated-looking error on a None path.
    parser.add_argument("--input", help="Path to input audio file", required=True)
    parser.add_argument("--output", help="Output folder", required=True)
    # Let the caller choose the model through --model_name (default: 'speech').
    parser.add_argument("--model_name", help="Model name: speech or basic", type=str, required=False, default="speech", choices=["speech", "basic"])
    parser.add_argument("--ddim_steps", help="Number of ddim steps", type=int, required=False, default=50)
    parser.add_argument("--chunk_size", help="chunk size", type=float, required=False, default=10.24)
    parser.add_argument("--guidance_scale", help="Guidance scale value",  type=float, required=False, default=3.5)
    parser.add_argument("--seed", help="Seed value, 0 = random seed", type=int, required=False, default=0)
    parser.add_argument("--overlap", help="overlap value", type=float, required=False, default=0.04)
    # Accepts "--multiband_ensemble True/False" as before, and also the bare flag.
    parser.add_argument("--multiband_ensemble", type=_parse_bool, nargs="?", const=True, help="Use multiband ensemble with input")
    parser.add_argument("--input_cutoff", help="Define the crossover of audio input in the multiband ensemble", type=int, required=False, default=12000)

    args = parser.parse_args()

    # These module-level names are read directly by Predictor.process_audio
    # and Predictor.predict, so they must keep exactly these names.
    input_file_path = args.input
    output_folder = args.output
    ddim_steps = args.ddim_steps
    chunk_size = args.chunk_size
    guidance_scale = args.guidance_scale
    seed = args.seed
    overlap = args.overlap
    input_cutoff = args.input_cutoff
    multiband_ensemble = args.multiband_ensemble

    # Crossover for the multiband ensemble sits 1 kHz below the input cutoff.
    crossover_freq = input_cutoff - 1000

    p = Predictor()

    # Pass the model_name received on the command line to setup(); the default
    # is the speech model ('haoheliu/audiosr_speech'). See audiosr/utils.py#L393.
    p.setup(model_name=args.model_name, device='auto')


    out = p.predict(
        input_file_path,
        ddim_steps=ddim_steps,
        guidance_scale=guidance_scale,
        seed=seed,
        chunk_size=chunk_size,
        overlap=overlap
    )

    del p
    gc.collect()
    torch.cuda.empty_cache()
diff --git a/GEMINI.md b/GEMINI.md
diff --git a/LICENSE b/LICENSE
                    MOST OF IT IS NOT MY CODE PUBLIC LICENSE

          :~!777!!!!!~~^::.                          .::^~~!!!7!!77!~:
         !J7777!!!!!!!7777!!^                      ^!!!777!!!!!!!7777J7.
        !J7!7777!!777??!77???:                    :???777??777!!777777J7.
       ~~^^^^^^^^^^^~!7^77!~^!.                  .!^~!77^7!~^^^^^^^^^^^~!.
      ^!^^^^^^^^~!~!~7^~^(   O)7                 .(O   )^~^7^!!!~^^^^^^^^!~
      ~7^^^^^^^^?!!??7.^!7J5777                  7775J?!~.!??!!?^^^^^^^^!!
      .~~^^^^^^^7!!???~::::^:^^~                ~^^:^^:.:~7??!~7^^^^^^^~!.
        :~!~~~~~!7J7!777^~~~!~~!.              .!~~!~~~~7?7!7J7!~~~~~!~:
          !::::::!77!!77JJJ?JJJ:                :JJJ?JJJ77!!777::::::7.
         .!::::::!?!7!777??JJJ:                  :YJJ??777!77?7::::::!.
       ::7~^:..::^?7!!77!J777??                  ??777J777!!7?^::..:^~7^:
   .~JYB##BBGY7~:..^~!?7??!????.                .????!7?7?!~^:.:^7YGBB###5J!.
 ~JP#&&&########G5?!~~!J!~77!7~                  ~7!77~!J!~^!?YGB######&&&&#GJ!.
  .:^~77?YPB###&&&&##BB#G^...                      ...^P#BB##&&&&###BG5J?7~^:..
           .^~7Y5PGBBBBB#Y                            Y#BBBBBGPPY?!^.
                  ..::::::                            .::::::.
      Man, I stole your code                          It's not my code

                           Version 0.1.0, May 2023
 
 Copyright (c) 2023 Chon, Sung Hyu

 Everyone is permitted to copy, distribute, and modify this work as long as the
 original licenses of this work are maintained.
 Most of the contents of this work are under MIT, GPL, or some other major public
 licenses. Use this work at your own risk.
 If copyright infringement occurs, the author will pretend that the author does
 not know anything, does not intend to defend you, and will claim that it's not
 the author's fault.
 
                    MOST OF IT IS NOT MY CODE PUBLIC LICENSE
        TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

 1. You do what the original authors tell you that you can do.

 2. If the original license or author is not mentioned for some part of the code,
    that part was probably written by the author. You can use those parts of the
    code however you like, but keep in mind that the author has very little
    experience in programming. Do not blame the author if you use that code in
    a serious application and something goes wrong.
diff --git a/my-speech-upscale-webui.py b/my-speech-upscale-webui.py
diff --git a/nuwave2_inference.py b/nuwave2_inference.py
 from lightning_model import NuWave2
 from omegaconf import OmegaConf as OC
 import os
 import argparse
 import datetime
 from glob import glob
 import torch
 import librosa as rosa
 from scipy.io.wavfile import write as swrite
 import matplotlib.pyplot as plt
 from utils.stft import STFTMag
 import numpy as np
 from scipy.signal import sosfiltfilt
 from scipy.signal import butter, cheby1, cheby2, ellip, bessel
 from scipy.signal import resample_poly
 import random


 def save_stft_mag(wav, fname):
    """Save a dB-scaled STFT magnitude image of ``wav[0]`` as a PNG file.

    Args:
        wav: Tensor whose first element ``wav[0]`` is the signal to plot; it is
            detached and moved to the CPU before the transform.
        fname: Destination path handed to ``Figure.savefig``.

    The transform comes from the module-level ``stft`` object, which is not
    defined in this function and must exist before the call.
    """
    figure = plt.figure(figsize=(9, 3))
    magnitude = stft(wav[0].detach().cpu()).numpy()
    # Decibels relative to the loudest bin, limited to an 80 dB range.
    magnitude_db = rosa.amplitude_to_db(magnitude, ref=np.max, top_db = 80.)
    plt.imshow(magnitude_db, aspect='auto', origin='lower', interpolation='none')
    plt.colorbar()
    plt.xlabel('Frames')
    plt.ylabel('Channels')
    plt.tight_layout()
    figure.savefig(fname, format='png')
    plt.close()

 if __name__ == '__main__':
    # Command-line entry point: load a NuWave2 checkpoint, upsample one audio
    # file, and write the reference audio, the model input and the result
    # (a WAV and a spectrogram PNG each) into hparams.log.test_result_dir.
    parser = argparse.ArgumentParser()
    parser.add_argument('-c',
                        '--checkpoint',
                        type=str,
                        required=True,
                        help="Checkpoint path")
    # The audio file is always loaded below, so the path is mandatory.
    parser.add_argument('-i',
                        '--wav',
                        type=str,
                        required=True,
                        help="audio")
    parser.add_argument('--sr',
                        type=int,
                        required=True,
                        help="Sampling rate of input audio")
    parser.add_argument('--steps',
                        type=int,
                        required=False,
                        help="Steps for sampling")
    parser.add_argument('--gt', action="store_true",
                        required=False, help="Whether the input audio is 48 kHz ground truth audio.")
    parser.add_argument('--device',
                        type=str,
                        default='cuda',
                        required=False,
                        help="Device, 'cuda' or 'cpu'")

    args = parser.parse_args()
    #torch.backends.cudnn.benchmark = False
    hparams = OC.load('nuwave2/hparameter.yaml')
    os.makedirs(hparams.log.test_result_dir, exist_ok=True)
    # The schedule stored in the config is only used for the 8-step default;
    # any other step count passes None to model.inference().
    if args.steps is None or args.steps == 8:
        args.steps = 8
        # NOTE(review): eval() executes whatever expression the YAML holds;
        # only load hparameter.yaml from a trusted source.
        noise_schedule = eval(hparams.dpm.infer_schedule)
    else:
        noise_schedule = None
    model = NuWave2(hparams).to(args.device)
    model.eval()
    # Module-level transform used by save_stft_mag().
    stft = STFTMag()
    # Recent PyTorch releases (2.6+) default torch.load to weights_only=True for
    # security reasons. The official checkpoint is trusted, so weights_only=False
    # is passed explicitly to keep the earlier loading behaviour.
    ckpt = torch.load(args.checkpoint, map_location='cpu', weights_only=False) # ★ weights_only=False made explicit
    # A checkpoint whose path contains 'EMA' is used as a bare state dict;
    # any other checkpoint is indexed by 'state_dict'.
    model.load_state_dict(ckpt['state_dict'] if not('EMA' in args.checkpoint) else ckpt)

    # Cutoff of the input expressed as a fraction of the model's Nyquist frequency.
    highcut = args.sr // 2
    nyq = 0.5 * hparams.audio.sampling_rate
    hi = highcut / nyq

    if args.gt:
        wav, _ = rosa.load(args.wav, sr=hparams.audio.sampling_rate, mono=True)
        peak = np.max(np.abs(wav))
        if peak > 0:  # leave an all-zero (silent) input untouched instead of dividing by zero
            wav /= peak
        # Trim to a whole number of hops.
        wav = wav[:len(wav) - len(wav) % hparams.audio.hop_length]

        order = 8
        sos = cheby1(order, 0.05, hi, btype='lowpass', output='sos')
        wav_l = sosfiltfilt(sos, wav)

        # downsample to the low sampling rate
        wav_l = resample_poly(wav_l, highcut * 2, hparams.audio.sampling_rate)
        # upsample to the original sampling rate
        wav_l = resample_poly(wav_l, hparams.audio.sampling_rate, highcut * 2)

        # Bring the low-passed signal back to the reference length.
        if len(wav_l) < len(wav):
            # Zero-pad wav_l itself; padding `wav` here would replace the
            # low-passed signal with the full-band reference.
            wav_l = np.pad(wav_l, (0, len(wav) - len(wav_l)), 'constant', constant_values=0)
        elif len(wav_l) > len(wav):
            wav_l = wav_l[:len(wav)]
    else:
        wav, _ = rosa.load(args.wav, sr=args.sr, mono=True)
        peak = np.max(np.abs(wav))
        if peak > 0:  # leave an all-zero (silent) input untouched instead of dividing by zero
            wav /= peak

        # upsample to the original sampling rate
        wav_l = resample_poly(wav, hparams.audio.sampling_rate, args.sr)
        # Trim to a whole number of hops.
        wav_l = wav_l[:len(wav_l) - len(wav_l) % hparams.audio.hop_length]

    # Per-bin indicator: 1 for the first int(hi * fft_size) frequency bins, 0 above.
    fft_size = hparams.audio.filter_length // 2 + 1
    band = torch.zeros(fft_size, dtype=torch.int64)
    band[:int(hi * fft_size)] = 1

    # Add a leading batch axis and move everything to the target device.
    wav = torch.from_numpy(wav).unsqueeze(0).to(args.device)
    wav_l = torch.from_numpy(wav_l.copy()).float().unsqueeze(0).to(args.device)
    band = band.unsqueeze(0).to(args.device)

    wav_recon, wav_list = model.inference(wav_l, band, args.steps, noise_schedule)

    # Reference audio: written at the model rate for --gt, otherwise at the input rate.
    wav = torch.clamp(wav, min=-1, max=1 - torch.finfo(torch.float16).eps)
    save_stft_mag(wav, os.path.join(hparams.log.test_result_dir, f'wav.png'))
    if args.gt:
        swrite(os.path.join(hparams.log.test_result_dir, f'wav.wav'),
               hparams.audio.sampling_rate, wav[0].detach().cpu().numpy())
    else:
        swrite(os.path.join(hparams.log.test_result_dir, f'wav.wav'),
               args.sr, wav[0].detach().cpu().numpy())

    # Model input at the model sampling rate.
    wav_l = torch.clamp(wav_l, min=-1, max=1 - torch.finfo(torch.float16).eps)
    save_stft_mag(wav_l, os.path.join(hparams.log.test_result_dir, f'wav_l.png'))
    swrite(os.path.join(hparams.log.test_result_dir, f'wav_l.wav'),
           hparams.audio.sampling_rate, wav_l[0].detach().cpu().numpy())

    # Model output.
    wav_recon = torch.clamp(wav_recon, min=-1, max=1 - torch.finfo(torch.float16).eps)
    save_stft_mag(wav_recon, os.path.join(hparams.log.test_result_dir, f'result.png'))
    swrite(os.path.join(hparams.log.test_result_dir, f'result.wav'),
           hparams.audio.sampling_rate, wav_recon[0].detach().cpu().numpy())

    # for i in range(len(wav_list)):
    #     wav_recon_i = torch.clamp(wav_list[i], min=-1, max=1-torch.finfo(torch.float16).eps)
    #     save_stft_mag(wav_recon_i, os.path.join(hparams.log.test_result_dir, f'result_{i}.png'))
    #     swrite(os.path.join(hparams.log.test_result_dir, f'result_{i}.wav'),
    #            hparams.audio.sampling_rate, wav_recon_i[0].detach().cpu().numpy())

diff --git a/nuwave2_model.py b/nuwave2_model.py
 # Some code is adapted from
 #https://github.com/ivanvovk/WaveGrad
 #https://github.com/lmnt-com/diffwave
 #https://github.com/NVlabs/SPADE
 #https://github.com/pkumivision/FFC

 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import torch.fft
 from math import sqrt, log

 # Short module-level aliases used by the layer definitions in this file.
 Linear = nn.Linear
 silu = F.silu
 relu = F.relu

 def Conv1d(*args, **kwargs):
    """Build an ``nn.Conv1d`` whose weight is re-initialised with Kaiming normal.

    All positional and keyword arguments are forwarded to ``nn.Conv1d``
    unchanged; only ``weight`` is re-initialised, the bias is left as created.
    """
    conv = nn.Conv1d(*args, **kwargs)
    nn.init.kaiming_normal_(conv.weight)
    return conv

 def Conv2d(*args, **kwargs):
    """Build an ``nn.Conv2d`` whose weight is re-initialised with Kaiming normal.

    All positional and keyword arguments are forwarded to ``nn.Conv2d``
    unchanged; only ``weight`` is re-initialised, the bias is left as created.
    """
    conv = nn.Conv2d(*args, **kwargs)
    nn.init.kaiming_normal_(conv.weight)
    return conv


 class DiffusionEmbedding(nn.Module):
    """Sinusoidal embedding of the noise level followed by two SiLU-activated linear layers."""

    def __init__(self, hparams):
        """Read the embedding sizes from ``hparams`` and build both projections.

        Reads ``hparams.dpm.pos_emb_channels``, ``hparams.dpm.pos_emb_scale``
        and ``hparams.arch.pos_emb_dim``.
        """
        super().__init__()
        dpm_cfg = hparams.dpm
        self.n_channels = dpm_cfg.pos_emb_channels
        self.linear_scale = dpm_cfg.pos_emb_scale
        self.out_channels = hparams.arch.pos_emb_dim

        self.projection1 = Linear(self.n_channels, self.out_channels)
        self.projection2 = Linear(self.out_channels, self.out_channels)

    def forward(self, noise_level):
        """Embed ``noise_level``; a trailing axis is squeezed away when it has more than one dimension."""
        if noise_level.dim() > 1:
            noise_level = noise_level.squeeze(-1)
        half_dim = self.n_channels // 2
        # Geometric frequency ladder running from 1 down to 1/10000.
        step = log(10000) / (half_dim - 1)
        # .to(noise_level) matches the dtype and device of the input.
        freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32).to(noise_level) * -step)
        angles = self.linear_scale * noise_level.unsqueeze(1) * freqs.unsqueeze(0)
        features = torch.cat((angles.sin(), angles.cos()), dim=-1)
        hidden = silu(self.projection1(features))
        return silu(self.projection2(hidden))


 class BSFT(nn.Module):
    """Band-conditioned modulation: scales and shifts ``x`` with maps predicted from ``band``."""

    def __init__(self, nhidden, out_channels):
        """Build the shared trunk and the two heads (``nhidden`` hidden channels, ``out_channels`` outputs)."""
        super().__init__()
        # Shared trunk over the 2-channel band input. It is a plain nn.Conv1d,
        # unlike the two heads, which go through the Conv1d helper.
        self.mlp_shared = nn.Conv1d(2, nhidden, kernel_size=3, padding=1)

        head_kwargs = dict(kernel_size=3, padding=1)
        self.mlp_gamma = Conv1d(nhidden, out_channels, **head_kwargs)
        self.mlp_beta = Conv1d(nhidden, out_channels, **head_kwargs)

    def forward(self, x, band):
        """Return ``x * (1 + gamma) + beta`` with gamma and beta computed from ``band``."""
        # band: (B, 2, n_fft // 2 + 1)
        shared = silu(self.mlp_shared(band))

        # The trailing singleton axis lets both maps broadcast over the last axis of x.
        scale = self.mlp_gamma(shared).unsqueeze(-1)
        shift = self.mlp_beta(shared).unsqueeze(-1)

        return x * (1 + scale) + shift


 class FourierUnit(nn.Module):
    """Frequency-domain branch: STFT, band-conditioned modulation, 1x1 convolution, inverse STFT."""

    def __init__(self, in_channels, out_channels, bsft_channels, filter_length=1024, hop_length=256, win_length=1024,
                 sampling_rate=48000):
        """Store the STFT settings and build the spectral layers.

        Args:
            in_channels: Channel count of the time-domain input.
            out_channels: Channel count of the time-domain output.
            bsft_channels: Hidden width passed to ``BSFT``.
            filter_length: FFT size.
            hop_length: STFT hop size.
            win_length: Length of the Hann window.
            sampling_rate: Stored on the module; not read elsewhere in this class.
        """
        # bn_layer not used
        super(FourierUnit, self).__init__()
        self.sampling_rate = sampling_rate
        self.n_fft = filter_length
        self.hop_size = hop_length
        self.win_size = win_length
        hann_window = torch.hann_window(win_length)
        self.register_buffer('hann_window', hann_window)

        # Real and imaginary parts are stacked along the channel axis, hence the factor 2.
        self.conv_layer = Conv2d(in_channels=in_channels * 2, out_channels=out_channels * 2,
                                 kernel_size=1, padding=0, bias=False)
        self.bsft = BSFT(bsft_channels, out_channels * 2)

    def forward(self, x, band):
        """Transform ``x`` in the STFT domain and return a signal with the input's time length.

        Args:
            x: Tensor with batch on the first axis and time on the last axis
                (presumably (B, C, T) -- TODO confirm against callers).
            band: Conditioning tensor handed to ``BSFT``.

        Returns:
            Tensor viewed as ``(batch, -1, T)`` where ``T`` is the input length.
        """
        batch = x.shape[0]
        n_samples = x.size()[-1]

        x = x.view(-1, n_samples)

        # stft/istft handling adapted for compatibility with PyTorch 1.9+.
        # 1. stft returns a complex tensor (return_complex=True).
        ffted = torch.stft(
            x, self.n_fft,
            hop_length=self.hop_size,
            win_length=self.win_size,
            window=self.hann_window,
            center=True,
            normalized=True,
            onesided=True,
            return_complex=True,    # ★ changed from False to True
        )

        # 2. Convert the complex tensor to a real one (adds a trailing axis of
        #    size 2) so that the convolution can operate on it.
        ffted = torch.view_as_real(ffted)       # ★ added
        ffted = ffted.permute(0, 3, 1, 2).contiguous()      # (BC, 2, n_fft/2+1, T)
        ffted = ffted.view((batch, -1,) + ffted.size()[2:]) # (B, 2C, n_fft/2+1, T)

        ffted = relu(self.bsft(ffted, band))    # (B, 2C, n_fft/2+1, T)
        ffted = self.conv_layer(ffted)

        # 3. Convert back to a complex tensor for istft.
        ffted = ffted.view((-1, 2,) + ffted.size()[2:]).permute(0, 2, 3, 1).contiguous()  # (BC, n_fft/2+1, T, 2)
        ffted = torch.view_as_complex(ffted)    # ★ added

        # Pin the output length to the input length. Without `length`, istft
        # returns hop * (frames - 1) samples, which only equals the input
        # length when it is a multiple of the hop size, and the view below
        # would then fail for any other length.
        output = torch.istft(ffted, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window,
                          center=True, normalized=True, onesided=True, length=n_samples)
        output = output.view(batch, -1, n_samples)
        return output


 class SpectralTransform(nn.Module):
    """1x1 conv down to half the output width, a FourierUnit with a skip connection, then a 1x1 conv up."""

    def __init__(self, in_channels, out_channels, bsft_channels, **audio_kwargs):
        """Build the three stages; ``audio_kwargs`` is forwarded to ``FourierUnit``."""
        # bn_layer not used
        super(SpectralTransform, self).__init__()
        hidden = out_channels // 2
        self.conv1 = Conv1d(in_channels, hidden, kernel_size=1, bias=False)

        self.fu = FourierUnit(hidden, hidden, bsft_channels, **audio_kwargs)

        self.conv2 = Conv1d(hidden, out_channels, kernel_size=1, bias=False)

    def forward(self, x, band):
        """Return ``conv2(h + fu(h, band))`` where ``h = silu(conv1(x))``."""
        reduced = silu(self.conv1(x))
        spectral = self.fu(reduced, band)
        return self.conv2(reduced + spectral)


 class FFC(nn.Module): # STFC
    """Two-branch convolution: a local and a global path with cross connections between them."""

    def __init__(self, in_channels, out_channels, bsft_channels, kernel_size=3,
                 ratio_gin=0.5, ratio_gout=0.5, padding=1,
                 **audio_kwargs):
        """Split the channels by ``ratio_gin`` / ``ratio_gout`` and build the four paths.

        ``audio_kwargs`` is forwarded to the ``SpectralTransform`` of the
        global-to-global path.
        """
        super(FFC, self).__init__()

        # Global share first (truncated by int()); the local share is the remainder.
        global_in = int(in_channels * ratio_gin)
        local_in = in_channels - global_in
        global_out = int(out_channels * ratio_gout)
        local_out = out_channels - global_out

        self.ratio_gin = ratio_gin
        self.ratio_gout = ratio_gout
        self.global_in_num = global_in

        conv_kwargs = dict(padding=padding, bias=False)
        self.convl2l = Conv1d(local_in, local_out, kernel_size, **conv_kwargs)
        self.convl2g = Conv1d(local_in, global_out, kernel_size, **conv_kwargs)
        self.convg2l = Conv1d(global_in, local_out, kernel_size, **conv_kwargs)
        self.convg2g = SpectralTransform(global_in, global_out, bsft_channels, **audio_kwargs)

    def forward(self, x_l, x_g, band):
        """Return ``(local_out, global_out)``; each output sums one contribution from every input branch."""
        to_local = self.convl2l(x_l) + self.convg2l(x_g)
        to_global = self.convl2g(x_l) + self.convg2g(x_g, band)

        return to_local, to_global


 class ResidualBlock(nn.Module):
    """Gated residual block built around an FFC (STFC) layer, conditioned on the noise-level embedding."""

    def __init__(self, residual_channels, pos_emb_dim, bsft_channels, **audio_kwargs):
        """Build the FFC layer and both projections; ``audio_kwargs`` is forwarded to ``FFC``."""
        super().__init__()
        # The FFC doubles the channel count so its output can be split into gate and filter halves.
        self.ffc1 = FFC(residual_channels, 2*residual_channels, bsft_channels,
                               kernel_size=3, ratio_gin=0.5, ratio_gout=0.5, padding=1, **audio_kwargs) # STFC

        self.diffusion_projection = Linear(pos_emb_dim, residual_channels)
        self.output_projection = Conv1d(residual_channels,
                                        2 * residual_channels, 1)

    def forward(self, x, band, noise_level):
        """Return ``((x + residual) / sqrt(2), skip)`` for the input ``x``."""
        conditioning = self.diffusion_projection(noise_level).unsqueeze(-1)

        y = x + conditioning
        # Channel split: local part first, then the FFC's global part.
        n_global = self.ffc1.global_in_num
        y_l, y_g = torch.split(y, [y.shape[1] - n_global, n_global], dim=1)
        y_l, y_g = self.ffc1(y_l, y_g, band) # STFC
        gate_l, filter_l = torch.chunk(y_l, 2, dim=1)
        gate_g, filter_g = torch.chunk(y_g, 2, dim=1)
        gate = torch.cat((gate_l, gate_g), dim=1)
        filt = torch.cat((filter_l, filter_g), dim=1)
        y = self.output_projection(torch.sigmoid(gate) * torch.tanh(filt))
        residual, skip = torch.chunk(y, 2, dim=1)
        return (x + residual) / sqrt(2.0), skip


 class NuWave2(nn.Module):
    """Stack of residual blocks that maps (audio, audio_low, band, noise_level) to a one-channel output."""

    def __init__(self, hparams):
        """Build the network from ``hparams`` (reads the ``arch`` and ``audio`` sections)."""
        super().__init__()
        self.hparams = hparams
        self.input_projection = Conv1d(2, hparams.arch.residual_channels, 1)
        self.diffusion_embedding = DiffusionEmbedding(
            hparams)
        # STFT settings passed down to every residual block.
        audio_kwargs = dict(filter_length = hparams.audio.filter_length, hop_length = hparams.audio.hop_length,
                          win_length = hparams.audio.win_length, sampling_rate = hparams.audio.sampling_rate)
        self.residual_layers = nn.ModuleList([
            ResidualBlock(hparams.arch.residual_channels,
                          hparams.arch.pos_emb_dim,
                          hparams.arch.bsft_channels,
                          **audio_kwargs)
            for i in range(hparams.arch.residual_layers)
        ])
        self.len_res = len(self.residual_layers)
        self.skip_projection = Conv1d(hparams.arch.residual_channels,
                                      hparams.arch.residual_channels, 1)
        self.output_projection = Conv1d(hparams.arch.residual_channels, 1, 1)

    def forward(self, audio, audio_low, band, noise_level):
        """Run the network.

        Args:
            audio: Signal stacked with ``audio_low`` along a new axis 1
                (presumably (B, T) -- TODO confirm against callers).
            audio_low: Second signal, same shape as ``audio``.
            band: Integer 0/1 indicator per frequency bin; one-hot encoded here.
            noise_level: Input to the diffusion embedding.

        Returns:
            The output projection with its channel axis (axis 1) squeezed away.
        """
        x = torch.stack((audio, audio_low), dim=1)
        x = self.input_projection(x)
        x = silu(x)
        noise_level = self.diffusion_embedding(noise_level)
        # Fix the class count at 2: without it one_hot infers the count from
        # the data, so an all-zero band would give a single channel while
        # BSFT.mlp_shared expects exactly two.
        band = F.one_hot(band, num_classes=2).transpose(1, -1).float()

        # Accumulating the skip connections directly is faster than stacking them.
        #skip = []
        skip =0.
        for layer in self.residual_layers:
            x, skip_connection = layer(x, band, noise_level)
            #skip.append(skip_connection)
            skip += skip_connection

        #x = torch.sum(torch.stack(skip), dim=0) / sqrt(self.len_res)
        x = skip / sqrt(self.len_res)
        x = self.skip_projection(x)
        x = silu(x)
        x = self.output_projection(x).squeeze(1)
        return x
diff --git a/nuwave2_utils_stft.py b/nuwave2_utils_stft.py
 import torch
 import torch.nn as nn
 import torch.nn.functional as F

 class STFTMag(nn.Module):
    """Magnitude spectrogram: absolute value of the Hann-windowed STFT."""

    def __init__(self,
                 nfft=1024,
                 hop=256):
        """Store the FFT size and hop length and create the Hann window."""
        super().__init__()
        self.nfft = nfft
        self.hop = hop
        # Registered as a non-persistent buffer, so it is left out of state_dict().
        self.register_buffer('window', torch.hann_window(nfft), persistent=False)

    #x: [B,T] or [T]
    @torch.no_grad()
    def forward(self, x):
        """Return ``|STFT(x)|`` computed without gradient tracking."""
        # torch.stft is asked for a complex result explicitly
        # (return_complex=True) and the magnitude is taken with torch.abs().
        # The earlier real-valued form (return_complex=False) used
        # torch.norm(stft, p=2, dim=-1) instead.
        spec = torch.stft(
            x,
            self.nfft,
            self.hop,
            window=self.window,
            return_complex=True,    # ★ return_complex=True made explicit
        )
        return torch.abs(spec)      # ★ magnitude via torch.abs()
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"collapsed_sections": [
	"Q-wMgl_X7kOD"
	]
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	},
	"accelerator": "GPU",
	"gpuClass": "standard"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"source": [
	"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/a577a97c199d7b756d3bc2d1ac44c1ca/-my-speech-upscale-webui.ipynb)"
	],
	"metadata": {
	"id": "fQsXgzlF3tZ0"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"# 음성 업스케일링\n",
	"- 능력자들이 만들어 놓은 음성 업스케일링 SoTA 모델들을 간단한 마우스 몇 번 '딸깍'으로 이용할 수 있도록 만든 페이지"
	],
	"metadata": {
	"id": "DB4U84-b-6aa"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"##### setup"
	],
	"metadata": {
	"id": "Q-wMgl_X7kOD"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!git clone https://gist.github.com/a577a97c199d7b756d3bc2d1ac44c1ca.git my-speech-upscale-webui"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "n3fLgpHJ7qj1",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 1318
	},
	"outputId": "13b9a5ce-8c02-41f3-b0a9-500b21795047"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Cloning into 'my-speech-upscale-webui'...\n",
	"remote: Enumerating objects: 22, done.\u001b[K\n",
	"remote: Counting objects: 100% (3/3), done.\u001b[K\n",
	"remote: Compressing objects: 100% (3/3), done.\u001b[K\n",
	"remote: Total 22 (delta 0), reused 2 (delta 0), pack-reused 19 (from 1)\u001b[K\n",
	"Receiving objects: 100% (22/22), 16.72 KiB \| 194.00 KiB/s, done.\n",
	"Resolving deltas: 100% (2/2), done.\n"
	]
	}
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### Nu-Wave 2\n",
	"- Seoul Natl. Univ.\n",
	"- 2022.06\n",
	"- [ArXiv](https://arxiv.org/abs/2206.08545), [GitHub](https://github.com/maum-ai/nuwave2)"
	],
	"metadata": {
	"id": "ACGu2Edk7r4t"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!git clone --recursive https://github.com/mindslab-ai/nuwave2.git\n",
	"\n",
	"!rm nuwave2/utils/stft.py; mv my-speech-upscale-webui/nuwave2_utils_stft.py nuwave2/utils/stft.py\n",
	"!rm nuwave2/model.py; mv my-speech-upscale-webui/nuwave2_model.py nuwave2/model.py\n",
	"!rm nuwave2/inference.py; mv my-speech-upscale-webui/nuwave2_inference.py nuwave2/inference.py"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "peqjrwYF-52_",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 7554
	},
	"outputId": "32ac41c4-8371-4245-8905-ece74b834a57"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Cloning into 'nuwave2'...\n",
	"remote: Enumerating objects: 239, done.\u001b[K\n",
	"remote: Counting objects: 100% (55/55), done.\u001b[K\n",
	"remote: Compressing objects: 100% (48/48), done.\u001b[K\n",
	"remote: Total 239 (delta 7), reused 7 (delta 7), pack-reused 184 (from 1)\u001b[K\n",
	"Receiving objects: 100% (239/239), 55.91 MiB \| 14.08 MiB/s, done.\n",
	"Resolving deltas: 100% (14/14), done.\n",
	"Submodule 'vctk-silence-labels' (https://github.com/nii-yamagishilab/vctk-silence-labels.git) registered for path 'vctk-silence-labels'\n",
	"Cloning into '/content/nuwave2/vctk-silence-labels'...\n",
	"remote: Enumerating objects: 11, done. \n",
	"remote: Counting objects: 100% (11/11), done. \n",
	"remote: Compressing objects: 100% (11/11), done. \n",
	"remote: Total 11 (delta 2), reused 7 (delta 0), pack-reused 0 (from 0) \n",
	"Receiving objects: 100% (11/11), 242.52 KiB \| 20.21 MiB/s, done.\n",
	"Resolving deltas: 100% (2/2), done.\n",
	"Submodule path 'vctk-silence-labels': checked out 'd8910f2eb6750f25a5d8a993ea1f0ef732075021'\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"!pip -qq install pytorch_lightning"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "ocDgypRMD-GL",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 5110
	},
	"outputId": "81a9dcc7-a8d7-46cd-b86d-2e4cc5151da4"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/832.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m832.4/832.4 kB\u001b[0m \u001b[31m45.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/983.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m983.2/983.2 kB\u001b[0m \u001b[31m67.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"!gdown 11t0cQYx6ZadKQjmfGnqxUUH2UEk5Yzk7 # 8 kHz ~ 48 kHz source → 48 kHz output\n",
	"!gdown 1IZihqb0LKHLtqRjyhHBGxXHJhUwskVRo # 3.2 kHz ~ 16 kHz source → 16 kHz output"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "w5AAGbbC9rdn",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 17663
	},
	"outputId": "a7228ff9-d04d-49e6-fb7b-fdfb0115c2f0"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Downloading...\n",
	"From: https://drive.google.com/uc?id=11t0cQYx6ZadKQjmfGnqxUUH2UEk5Yzk7\n",
	"To: /content/nuwave2_02_16_13_epoch=629.ckpt\n",
	"100% 20.9M/20.9M [00:00<00:00, 40.1MB/s]\n",
	"Downloading...\n",
	"From: https://drive.google.com/uc?id=1IZihqb0LKHLtqRjyhHBGxXHJhUwskVRo\n",
	"To: /content/nuwave2_08_16_08_(3.2k-16k_to_16k)_epoch=584.ckpt\n",
	"100% 20.9M/20.9M [00:00<00:00, 93.2MB/s]\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"!export PATH=/content/nuwave2:$PATH"
	],
	"metadata": {
	"id": "try_oR_aEI1o",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 106
	}
	},
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### AudioSR\n",
	"- Univ. of Surrey, Univ. of CA S.D., ByteDance\n",
	"- 2023.09\n",
	"- [ArXiv](https://arxiv.org/abs/2309.07314), [GitHub](https://github.com/haoheliu/versatile_audio_super_resolution)"
	],
	"metadata": {
	"id": "n3VYQ5kg9W-L"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!git clone https://github.com/haoheliu/versatile_audio_super_resolution.git\n",
	"\n",
	"!rm versatile_audio_super_resolution/inference.py; mv my-speech-upscale-webui/audiosr_inference.py versatile_audio_super_resolution/inference.py"
	],
	"metadata": {
	"id": "gkX0yUyR9j7U",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 2530
	},
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"outputId": "404330ab-f443-4b98-9b76-73bebde6364e"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"Cloning into 'versatile_audio_super_resolution'...\n",
	"remote: Enumerating objects: 475, done.\u001b[K\n",
	"remote: Counting objects: 100% (233/233), done.\u001b[K\n",
	"remote: Compressing objects: 100% (107/107), done.\u001b[K\n",
	"remote: Total 475 (delta 165), reused 139 (delta 126), pack-reused 242 (from 3)\u001b[K\n",
	"Receiving objects: 100% (475/475), 22.78 MiB \| 14.60 MiB/s, done.\n",
	"Resolving deltas: 100% (225/225), done.\n"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"!pip -qq install cog\n",
	"!pip -qq install ftfy\n",
	"!pip -qq install phonemizer\n",
	"!pip -qq install pyloudnorm\n",
	"!pip -qq install torchlibrosa\n",
	"!pip -qq install unidecode"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "gb8vtGGd5hym",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 21010
	},
	"outputId": "46e394c4-6747-451d-f6ea-a2648ddf3ef8"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/78.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.1/78.1 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/60.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.8/60.8 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/95.5 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m95.5/95.5 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/67.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.2/67.2 kB\u001b[0m \u001b[31m6.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/510.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.8/510.8 kB\u001b[0m \u001b[31m30.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.7/4.7 MB\u001b[0m \u001b[31m91.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m452.2/452.2 kB\u001b[0m \u001b[31m31.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.8/44.8 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.2/48.2 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.8/103.8 kB\u001b[0m \u001b[31m8.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.9/59.9 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.4/213.4 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m565.1/565.1 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m235.8/235.8 kB\u001b[0m \u001b[31m11.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
	"\u001b[?25h"
	]
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"from huggingface_hub import hf_hub_download\n",
	"\n",
	"# AudioSR speech 모델 다운로드\n",
	"hf_hub_download(\n",
	" repo_id=\"haoheliu/audiosr_speech\", # 또는 \"haoheliu/audiosr_basic\"\n",
	" filename=\"pytorch_model.bin\",\n",
	")"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 209,
	"referenced_widgets": []
	},
	"id": "eE0F3YAs6P9v",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 86877
	},
	"outputId": "475ca58e-ec91-4f8e-bb26-374ab2538c6d"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stderr",
	"text": [
	"/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning: \n",
	"The secret `HF_TOKEN` does not exist in your Colab secrets.\n",
	"To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n",
	"You will be able to reuse this secret in all of your notebooks.\n",
	"Please note that authentication is recommended but still optional to access public models or datasets.\n",
	" warnings.warn(\n"
	]
	},
	{
	"output_type": "display_data",
	"data": {
	"text/plain": [
	"pytorch_model.bin: 0%\| \| 0.00/6.18G [00:00<?, ?B/s]"
	]
	},
	"metadata": {}
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"'/root/.cache/huggingface/hub/models--haoheliu--audiosr_speech/snapshots/413f1d734411663e95310c17d381279a0c049960/pytorch_model.bin'"
	],
	"application/vnd.google.colaboratory.intrinsic+json": {
	"type": "string"
	}
	},
	"metadata": {},
	"execution_count": 0
	}
	]
	},
	{
	"cell_type": "code",
	"source": [
	"import os\n",
	"\n",
	"# `!export ...` only changes a throwaway subshell, so the PATH change was lost.\n",
	"# Update the kernel's own environment instead; later `!` commands inherit it.\n",
	"AUDIOSR_DIR = \"/content/versatile_audio_super_resolution\"\n",
	"if AUDIOSR_DIR not in os.environ[\"PATH\"].split(os.pathsep):\n",
	"    os.environ[\"PATH\"] = AUDIOSR_DIR + os.pathsep + os.environ[\"PATH\"]"
	],
	"metadata": {
	"id": "gLFE3RWy6P4Y",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 107
	}
	},
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"###### FlowHigh"
	],
	"metadata": {
	"id": "QvBWOGw49kSW"
	}
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "VxMHYh2y9l2e",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 2
	}
	},
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"##### run"
	],
	"metadata": {
	"id": "F4qIvS54-8iS"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"!python3 nuwave2/inference.py -c /content/nuwave2_02_16_13_epoch=629.ckpt -i sample.wav --sr 15000"
	],
	"metadata": {
	"id": "3a_oLQi24x2Q",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 37946
	}
	},
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"!python3 versatile_audio_super_resolution/inference.py --input sample.wav --output ./audiosr_output"
	],
	"metadata": {
	"colab": {
	"base_uri": "https://localhost:8080/"
	},
	"id": "mTOT8spL7LGC",
	"executionInfo": {
	"status": "ok",
	"user_tz": -540,
	"elapsed": 157407
	},
	"outputId": "92797cd0-f2ae-4c62-a2ee-fb096fcdbf33"
	},
	"execution_count": 0,
	"outputs": [
	{
	"output_type": "stream",
	"name": "stdout",
	"text": [
	"2025-09-11 14:50:11.971072: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
	"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
	"E0000 00:00:1757602212.295152 2230 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
	"E0000 00:00:1757602212.376606 2230 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
	"W0000 00:00:1757602213.034513 2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
	"W0000 00:00:1757602213.034551 2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
	"W0000 00:00:1757602213.034555 2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
	"W0000 00:00:1757602213.034559 2230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.\n",
	"2025-09-11 14:50:13.092107: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
	"To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
	"tokenizer_config.json: 100% 25.0/25.0 [00:00<00:00, 191kB/s]\n",
	"vocab.json: 100% 899k/899k [00:00<00:00, 5.26MB/s]\n",
	"merges.txt: 100% 456k/456k [00:00<00:00, 2.88MB/s]\n",
	"tokenizer.json: 100% 1.36M/1.36M [00:00<00:00, 111MB/s]\n",
	"config.json: 100% 481/481 [00:00<00:00, 4.99MB/s]\n",
	"/usr/local/lib/python3.12/dist-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers\n",
	" warnings.warn(f\"Importing from {__name__} is deprecated, please import via timm.layers\", FutureWarning)\n",
	"Loading Model...\n",
	"Loading AudioSR: speech\n",
	"Loading model on cuda:0\n",
	"DiffusionWrapper has 258.20 M params.\n",
	"Model loaded!\n",
	"Setting seed to: 3096934589\n",
	"overlap = 0.04\n",
	"guidance_scale = 3.5\n",
	"ddim_steps = 50\n",
	"chunk_size = 10.24\n",
	"multiband_ensemble = None\n",
	"input file = sample.wav\n",
	"audio.shape = (176640,)\n",
	"input cutoff = 12000\n",
	"Audio is mono\n",
	"enable_overlap = True\n",
	"Processing chunk 1 of 1 for Left/Mono channel\n",
	"Running DDIM Sampling with 50 timesteps\n",
	"DDIM Sampler: 100% 50/50 [00:11<00:00, 4.22it/s]\n",
	"file created: ./audiosr_output/SR_sample.wav\n",
	"\u001b[0m"
	]
	}
	]
	}
	]
	}
	import gc
	import os
	import random
	import numpy as np
	from scipy.signal.windows import hann
	import soundfile as sf
	import torch
	from cog import BasePredictor, Input, Path
	import tempfile
	import argparse
	import librosa
	from audiosr import build_model, super_resolution
	from scipy import signal
	import pyloudnorm as pyln


	import warnings
	# Silence every warning category for the whole process.
	warnings.filterwarnings("ignore")

	# NOTE(review): presumably read by the Hugging Face tokenizers library to allow
	# parallel tokenization — confirm.
	os.environ["TOKENIZERS_PARALLELISM"] = "true"
	# "high" lets float32 matmuls use faster, reduced-precision kernels
	# (see the PyTorch docs for the exact modes).
	torch.set_float32_matmul_precision("high")

	def match_array_shapes(array_1:np.ndarray, array_2:np.ndarray):
	    """Trim or zero-pad ``array_1`` so its sample axis is as long as ``array_2``'s.

	    When both arrays are 1-D the only axis is adjusted; otherwise axis 1 is
	    adjusted (both arrays are then indexed as 2-D).

	    Zeros are always appended at the END. The previous 1-D branch padded at
	    the front, which shifted the audio and disagreed with the 2-D branch.

	    Args:
	        array_1: Array to resize.
	        array_2: Array whose sample-axis length is the target.

	    Returns:
	        ``array_1`` itself when the lengths already match, otherwise a
	        trimmed view or a zero-padded copy.
	    """
	    both_1d = (array_1.ndim == 1) and (array_2.ndim == 1)
	    axis = 0 if both_1d else 1
	    target_len = array_2.shape[axis]
	    current_len = array_1.shape[axis]

	    if current_len > target_len:
	        array_1 = array_1[:target_len] if both_1d else array_1[:, :target_len]
	    elif current_len < target_len:
	        tail_pad = (0, target_len - current_len)
	        pad_width = tail_pad if both_1d else ((0, 0), tail_pad)
	        array_1 = np.pad(array_1, pad_width, 'constant', constant_values=0)
	    return array_1


	def lr_filter(audio, cutoff, filter_type, order=12, sr=48000):
	    """Apply a zero-phase Butterworth filter along the sample axis.

	    A Butterworth filter of order ``order // 2`` is run forward and backward
	    with ``sosfiltfilt``. The filter is designed directly as second-order
	    sections instead of going through ``(b, a)`` polynomials and ``tf2sos``,
	    because the polynomial form loses precision as the order grows.

	    Args:
	        audio: Samples; the input is transposed before filtering and the
	            result is transposed back, so a 2-D input is filtered along
	            its first axis.
	        cutoff: Cutoff frequency in Hz.
	        filter_type: ``btype`` passed to ``scipy.signal.butter``
	            (e.g. ``'lowpass'`` or ``'highpass'``).
	        order: Twice the order of the underlying Butterworth design.
	        sr: Sample rate in Hz used to normalise ``cutoff``.

	    Returns:
	        The filtered samples, with the same layout as ``audio``.
	    """
	    nyquist = 0.5 * sr
	    normal_cutoff = cutoff / nyquist
	    sos = signal.butter(order//2, normal_cutoff, btype=filter_type, analog=False, output='sos')
	    filtered_audio = signal.sosfiltfilt(sos, audio.T)
	    return filtered_audio.T

	class Predictor(BasePredictor):
	    """Chunked AudioSR upsampler that writes a 48 kHz result file.

	    ``process_audio`` and ``predict`` read the module-level names
	    ``input_cutoff``, ``multiband_ensemble``, ``crossover_freq`` and
	    ``output_folder``. In this file they are assigned by the ``__main__``
	    block, so they must exist before either method is called.
	    """

	    def setup(self, model_name="basic", device="auto"):
	        """Build the AudioSR model and keep it on ``self.audiosr``.

	        Args:
	            model_name: Forwarded to ``build_model``.
	            device: Forwarded to ``build_model``.
	        """
	        self.model_name = model_name
	        self.device = device
	        self.sr = 48000  # sample rate of the generated audio
	        print("Loading Model...")
	        self.audiosr = build_model(model_name=self.model_name, device=self.device)
	        print("Model loaded!")

	    def process_audio(self, input_file, chunk_size=5.12, overlap=0.1, seed=None, guidance_scale=3.5, ddim_steps=50):
	        """Upsample ``input_file`` chunk by chunk and return the waveform.

	        The file is loaded at ``input_cutoff * 2`` Hz and split per channel
	        into zero-padded chunks of ``chunk_size`` seconds. Each chunk is run
	        through ``super_resolution``, normalised back to the loudness measured
	        on its input, cross-faded with its neighbours and overlap-added into
	        one buffer.

	        Args:
	            input_file: Path of the audio file to upsample.
	            chunk_size: Chunk duration in seconds.
	            overlap: Fraction of a chunk shared with the next chunk; values
	                ``<= 0`` disable overlapping.
	            seed: Forwarded to ``super_resolution``.
	            guidance_scale: Forwarded to ``super_resolution``.
	            ddim_steps: Forwarded to ``super_resolution``.

	        Returns:
	            Without ``multiband_ensemble``: a 1-D array for mono input or a
	            ``(samples, 2)`` array for stereo input. The buffer spans whole
	            chunks, so it can end in silence. With ``multiband_ensemble``
	            the result is the sum of the low-passed input and the
	            high-passed model output.

	        Raises:
	            ValueError: If ``chunk_size``/``overlap`` leave no forward step
	                between consecutive chunks (the loop would never finish).
	        """
	        sr = input_cutoff * 2
	        audio, _ = librosa.load(input_file, sr=sr, mono=False)
	        audio = audio.T
	        print(f"audio.shape = {audio.shape}")
	        print(f"input cutoff = {input_cutoff}")

	        is_stereo = len(audio.shape) == 2
	        audio_channels = [audio] if not is_stereo else [audio[:, 0], audio[:, 1]]
	        print("audio is stereo" if is_stereo else "Audio is mono")

	        chunk_samples = int(chunk_size * sr)
	        overlap_samples = int(overlap * chunk_samples)
	        output_chunk_samples = int(chunk_size * self.sr)
	        output_overlap_samples = int(overlap * output_chunk_samples)
	        enable_overlap = overlap > 0
	        print(f"enable_overlap = {enable_overlap}")

	        # Distance between the starts of consecutive chunks, in input and output samples.
	        input_hop = chunk_samples - overlap_samples if enable_overlap else chunk_samples
	        output_hop = output_chunk_samples - output_overlap_samples if enable_overlap else output_chunk_samples
	        if input_hop <= 0:
	            raise ValueError(
	                f"chunk_size={chunk_size} and overlap={overlap} leave no forward step between chunks"
	            )

	        def process_chunks(audio):
	            """Split one channel into fixed-size chunks, zero-padding the last one."""
	            chunks = []
	            original_lengths = []
	            start = 0
	            while start < len(audio):
	                end = min(start + chunk_samples, len(audio))
	                chunk = audio[start:end]
	                original_lengths.append(len(chunk))
	                if len(chunk) < chunk_samples:
	                    chunk = np.concatenate([chunk, np.zeros(chunk_samples - len(chunk))])
	                chunks.append(chunk)
	                start += input_hop
	            return chunks, original_lengths

	        # Process both channels (mono or stereo)
	        chunks_per_channel = [process_chunks(channel) for channel in audio_channels]
	        sample_rate_ratio = self.sr / sr
	        num_chunks = len(chunks_per_channel[0][0])
	        total_length = num_chunks * output_chunk_samples - (num_chunks - 1) * (output_overlap_samples if enable_overlap else 0)
	        reconstructed_channels = [np.zeros((1, total_length)) for _ in audio_channels]

	        meter_before = pyln.Meter(sr)
	        meter_after = pyln.Meter(self.sr)

	        # Process chunks for each channel
	        for ch_idx, (chunks, original_lengths) in enumerate(chunks_per_channel):
	            for i, chunk in enumerate(chunks):
	                loudness_before = meter_before.integrated_loudness(chunk)
	                print(f"Processing chunk {i+1} of {len(chunks)} for {'Left/Mono' if ch_idx == 0 else 'Right'} channel")
	                # super_resolution takes a file path, so the chunk goes through a temporary WAV file.
	                with tempfile.NamedTemporaryFile(suffix=".wav", delete=True) as temp_wav:
	                    sf.write(temp_wav.name, chunk, sr)

	                    out_chunk = super_resolution(
	                        self.audiosr,
	                        temp_wav.name,
	                        seed=seed,
	                        guidance_scale=guidance_scale,
	                        ddim_steps=ddim_steps,
	                        latent_t_per_second=12.8
	                    )

	                out_chunk = out_chunk[0]
	                # Drop the part of the output that corresponds to the zero padding.
	                num_samples_to_keep = int(original_lengths[i] * sample_rate_ratio)
	                out_chunk = out_chunk[:, :num_samples_to_keep].squeeze()
	                loudness_after = meter_after.integrated_loudness(out_chunk)
	                out_chunk = pyln.normalize.loudness(out_chunk, loudness_after, loudness_before)

	                if enable_overlap:
	                    actual_overlap_samples = min(output_overlap_samples, num_samples_to_keep)
	                    # A zero-length fade would turn `[-0:]` into the whole chunk.
	                    if actual_overlap_samples > 0:
	                        # Fade in only when a previous chunk overlaps the head, and
	                        # fade out only when a following chunk overlaps the tail.
	                        # A single-chunk file therefore keeps its tail at full level.
	                        if i > 0:
	                            out_chunk[:actual_overlap_samples] *= np.linspace(0., 1., actual_overlap_samples)
	                        if i < len(chunks) - 1:
	                            out_chunk[-actual_overlap_samples:] *= np.linspace(1., 0., actual_overlap_samples)

	                start = i * output_hop
	                end = start + out_chunk.shape[0]
	                reconstructed_channels[ch_idx][0, start:end] += out_chunk.flatten()

	        reconstructed_audio = np.stack(reconstructed_channels, axis=-1) if is_stereo else reconstructed_channels[0]

	        if multiband_ensemble:
	            # Keep the original low band and take only the high band from the model output.
	            low, _ = librosa.load(input_file, sr=48000, mono=False)
	            output = match_array_shapes(reconstructed_audio[0].T, low)
	            low = lr_filter(low.T, crossover_freq, 'lowpass', order=10)
	            high = lr_filter(output.T, crossover_freq, 'highpass', order=10)
	            high = lr_filter(high, 23000, 'lowpass', order=2)
	            output = low + high
	        else:
	            output = reconstructed_audio[0]
	        return output


	    def predict(self,
	        input_file: Path = Input(description="Audio to upsample"),
	        ddim_steps: int = Input(description="Number of inference steps", default=50, ge=10, le=500),
	        guidance_scale: float = Input(description="Scale for classifier free guidance", default=3.5, ge=1.0, le=20.0),
	        overlap: float = Input(description="overlap size", default=0.04),
	        chunk_size: float = Input(description="chunksize", default=10.24),
	        seed: int = Input(description="Random seed. Leave blank to randomize the seed", default=None)
	    ) -> Path:
	        """Upsample ``input_file`` and write ``SR_<name>.wav`` into ``output_folder``.

	        The loaded model is released afterwards, so ``setup`` must be called
	        again before another prediction.

	        Args:
	            input_file: Audio file to upsample.
	            ddim_steps: Number of inference steps.
	            guidance_scale: Classifier-free guidance scale.
	            overlap: Fraction of a chunk shared with the next chunk.
	            chunk_size: Chunk duration in seconds.
	            seed: Random seed; ``None`` (blank) or ``0`` picks a random one.

	        Returns:
	            Path of the written WAV file.
	        """
	        # The input description promises a random seed when left blank, so
	        # treat None like 0.
	        if seed is None or seed == 0:
	            seed = random.randint(0, 2**32 - 1)
	        print(f"Setting seed to: {seed}")
	        print(f"overlap = {overlap}")
	        print(f"guidance_scale = {guidance_scale}")
	        print(f"ddim_steps = {ddim_steps}")
	        print(f"chunk_size = {chunk_size}")
	        print(f"multiband_ensemble = {multiband_ensemble}")
	        print(f"input file = {os.path.basename(input_file)}")
	        os.makedirs(output_folder, exist_ok=True)
	        waveform = self.process_audio(
	            input_file,
	            chunk_size=chunk_size,
	            overlap=overlap,
	            seed=seed,
	            guidance_scale=guidance_scale,
	            ddim_steps=ddim_steps
	        )

	        filename = os.path.splitext(os.path.basename(input_file))[0]
	        output_path = f"{output_folder}/SR_{filename}.wav"
	        sf.write(output_path, data=waveform, samplerate=self.sr, subtype="PCM_16")
	        print(f"file created: {output_path}")
	        # Free the model and the result buffer before returning.
	        del self.audiosr, waveform
	        gc.collect()
	        torch.cuda.empty_cache()
	        return Path(output_path)


	if __name__ == "__main__":

	    def _str_to_bool(value):
	        """Parse a command-line boolean.

	        ``type=bool`` would treat every non-empty string, including
	        "False", as True.
	        """
	        lowered = value.strip().lower()
	        if lowered in ("1", "true", "t", "yes", "y", "on"):
	            return True
	        if lowered in ("", "0", "false", "f", "no", "n", "off"):
	            return False
	        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

	    parser = argparse.ArgumentParser(description="Upsample an audio file to 48 kHz with AudioSR.")
	    # Both paths are used unconditionally below, so fail early with a usage message.
	    parser.add_argument("--input", help="Path to input audio file", required=True)
	    parser.add_argument("--output", help="Output folder", required=True)
	    # --model_name selects which AudioSR model to load (default: 'speech').
	    parser.add_argument("--model_name", help="Model name: speech or basic", type=str, required=False, default="speech", choices=["speech", "basic"])
	    parser.add_argument("--ddim_steps", help="Number of ddim steps", type=int, required=False, default=50)
	    parser.add_argument("--chunk_size", help="chunk size", type=float, required=False, default=10.24)
	    parser.add_argument("--guidance_scale", help="Guidance scale value", type=float, required=False, default=3.5)
	    parser.add_argument("--seed", help="Seed value, 0 = random seed", type=int, required=False, default=0)
	    parser.add_argument("--overlap", help="overlap value", type=float, required=False, default=0.04)
	    parser.add_argument("--multiband_ensemble", type=_str_to_bool, help="Use multiband ensemble with input")
	    parser.add_argument("--input_cutoff", help="Define the crossover of audio input in the multiband ensemble", type=int, required=False, default=12000)

	    args = parser.parse_args()

	    # These names stay at module level on purpose: Predictor.process_audio and
	    # Predictor.predict read output_folder, input_cutoff, multiband_ensemble
	    # and crossover_freq as globals.
	    input_file_path = args.input
	    output_folder = args.output
	    ddim_steps = args.ddim_steps
	    chunk_size = args.chunk_size
	    guidance_scale = args.guidance_scale
	    seed = args.seed
	    overlap = args.overlap
	    input_cutoff = args.input_cutoff
	    multiband_ensemble = args.multiband_ensemble

	    # The multiband crossover sits 1 kHz below the input cutoff.
	    crossover_freq = input_cutoff - 1000

	    p = Predictor()

	    # Pass the model name from the command line to setup. The default is the
	    # speech model ('haoheliu/audiosr_speech'); see audiosr/utils.py#L393.
	    p.setup(model_name=args.model_name, device='auto')

	    out = p.predict(
	        input_file_path,
	        ddim_steps=ddim_steps,
	        guidance_scale=guidance_scale,
	        seed=seed,
	        chunk_size=chunk_size,
	        overlap=overlap
	    )

	    del p
	    gc.collect()
	    torch.cuda.empty_cache()
	MOST OF IT IS NOT MY CODE PUBLIC LICENSE

	:~!777!!!!!~~^::. .::^~~!!!7!!77!~:
	!J7777!!!!!!!7777!!^ ^!!!777!!!!!!!7777J7.
	!J7!7777!!777??!77???: :???777??777!!777777J7.
	~~^^^^^^^^^^^~!7^77!~^!. .!^~!77^7!~^^^^^^^^^^^~!.
	^!^^^^^^^^~!~!~7^~^( O)7 .(O )^~^7^!!!~^^^^^^^^!~
	~7^^^^^^^^?!!??7.^!7J5777 7775J?!~.!??!!?^^^^^^^^!!
	.~~^^^^^^^7!!???~::::^:^^~ ~^^:^^:.:~7??!~7^^^^^^^~!.
	:~!~~~~~!7J7!777^~~~!~~!. .!~~!~~~~7?7!7J7!~~~~~!~:
	!::::::!77!!77JJJ?JJJ: :JJJ?JJJ77!!777::::::7.
	.!::::::!?!7!777??JJJ: :YJJ??777!77?7::::::!.
	::7~^:..::^?7!!77!J777?? ??777J777!!7?^::..:^~7^:
	.~JYB##BBGY7~:..^~!?7??!????. .????!7?7?!~^:.:^7YGBB###5J!.
	~JP#&&&########G5?!~~!J!~77!7~ ~7!77~!J!~^!?YGB######&&&&#GJ!.
	.:^~77?YPB###&&&&##BB#G^... ...^P#BB##&&&&###BG5J?7~^:..
	.^~7Y5PGBBBBB#Y Y#BBBBBGPPY?!^.
	..:::::: .::::::.
	Man, I stole your code It's not my code

	Version 0.1.0, May 2023

	Copyright (c) 2023 Chon, Sung Hyu

	Everyone is permitted to copy, distribute, and modify this work as long as the
	original licenses of this work are maintained.
	Most of the contents of this work are under MIT, GPL, or some other major public
	licenses. Use this work at your own risk.
	If copyright infringement occurs, the author will pretend that the author does
	not know anything, does not intend to defend you, and will claim that it's not
	the author's fault.

	MOST OF IT IS NOT MY CODE PUBLIC LICENSE
	TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

	1. You do what the original authors tell you that you can do.

	2. If the original license or author is not mentioned in some parts of the code,
	those parts were probably written by the author. You can use those parts of the
	code however you would like to, but keep in mind that the author has very
	little experience in programming. Do not blame the author when you use that
	code for some serious applications and something goes wrong.
	from lightning_model import NuWave2
	from omegaconf import OmegaConf as OC
	import os
	import argparse
	import datetime
	from glob import glob
	import torch
	import librosa as rosa
	from scipy.io.wavfile import write as swrite
	import matplotlib.pyplot as plt
	from utils.stft import STFTMag
	import numpy as np
	from scipy.signal import sosfiltfilt
	from scipy.signal import butter, cheby1, cheby2, ellip, bessel
	from scipy.signal import resample_poly
	import random


	def save_stft_mag(wav, fname):
	    """Save the dB-scaled STFT magnitude of ``wav[0]`` to ``fname`` as a PNG.

	    Uses the module-level ``stft`` object, which the ``__main__`` block
	    creates before calling this function.

	    Args:
	        wav: Tensor whose first entry is the waveform to plot.
	        fname: Destination path of the image.
	    """
	    fig = plt.figure(figsize=(9, 3))
	    magnitude = stft(wav[0].detach().cpu()).numpy()
	    # Decibels relative to the loudest bin, limited to an 80 dB range.
	    magnitude_db = rosa.amplitude_to_db(magnitude, ref=np.max, top_db = 80.)
	    plt.imshow(magnitude_db, aspect='auto', origin='lower', interpolation='none')
	    plt.colorbar()
	    plt.xlabel('Frames')
	    plt.ylabel('Channels')
	    plt.tight_layout()
	    fig.savefig(fname, format='png')
	    plt.close()
	    return

	if __name__ == '__main__':

	    def _peak_normalize(samples):
	        """Scale ``samples`` in place to a peak magnitude of 1.

	        Empty or all-zero input is returned unchanged instead of being
	        divided by zero.
	        """
	        peak = np.max(np.abs(samples)) if samples.size else 0.0
	        if peak > 0:
	            samples /= peak
	        return samples

	    parser = argparse.ArgumentParser()
	    parser.add_argument('-c',
	                        '--checkpoint',
	                        type=str,
	                        required=True,
	                        help="Checkpoint path")
	    # The audio path is loaded unconditionally below, so it is required.
	    parser.add_argument('-i',
	                        '--wav',
	                        type=str,
	                        required=True,
	                        help="audio")
	    parser.add_argument('--sr',
	                        type=int,
	                        required=True,
	                        help="Sampling rate of input audio")
	    parser.add_argument('--steps',
	                        type=int,
	                        required=False,
	                        help="Steps for sampling")
	    parser.add_argument('--gt', action="store_true",
	                        required=False, help="Whether the input audio is 48 kHz ground truth audio.")
	    parser.add_argument('--device',
	                        type=str,
	                        default='cuda',
	                        required=False,
	                        help="Device, 'cuda' or 'cpu'")

	    args = parser.parse_args()
	    #torch.backends.cudnn.benchmark = False
	    # Relative to the working directory: the script is expected to be started
	    # from the parent of the nuwave2/ folder.
	    hparams = OC.load('nuwave2/hparameter.yaml')
	    os.makedirs(hparams.log.test_result_dir, exist_ok=True)
	    if args.steps is None or args.steps == 8:
	        args.steps = 8
	        # NOTE(review): eval() executes whatever expression the YAML file
	        # contains; only run this with a trusted hparameter.yaml.
	        noise_schedule = eval(hparams.dpm.infer_schedule)
	    else:
	        noise_schedule = None
	    model = NuWave2(hparams).to(args.device)
	    model.eval()
	    # Module-level on purpose: save_stft_mag reads `stft` as a global.
	    stft = STFTMag()
	    # torch.load defaults to weights_only=True since PyTorch 2.6. The official
	    # checkpoint is trusted, so weights_only=False is passed explicitly to keep
	    # the previous loading behaviour (this unpickles arbitrary objects; do not
	    # use it on untrusted checkpoints).
	    ckpt = torch.load(args.checkpoint, map_location='cpu', weights_only=False)
	    model.load_state_dict(ckpt['state_dict'] if not('EMA' in args.checkpoint) else ckpt)

	    highcut = args.sr // 2
	    nyq = 0.5 * hparams.audio.sampling_rate
	    hi = highcut / nyq

	    if args.gt:
	        wav, _ = rosa.load(args.wav, sr=hparams.audio.sampling_rate, mono=True)
	        wav = _peak_normalize(wav)
	        wav = wav[:len(wav) - len(wav) % hparams.audio.hop_length]

	        order = 8
	        sos = cheby1(order, 0.05, hi, btype='lowpass', output='sos')
	        wav_l = sosfiltfilt(sos, wav)

	        # downsample to the low sampling rate
	        wav_l = resample_poly(wav_l, highcut * 2, hparams.audio.sampling_rate)
	        # upsample to the original sampling rate
	        wav_l = resample_poly(wav_l, hparams.audio.sampling_rate, highcut * 2)

	        # Make the degraded signal exactly as long as the ground truth. The
	        # short case must pad wav_l itself; padding `wav` replaced the
	        # low-pass input with the ground truth.
	        if len(wav_l) < len(wav):
	            wav_l = np.pad(wav_l, (0, len(wav) - len(wav_l)), 'constant', constant_values=0)
	        elif len(wav_l) > len(wav):
	            wav_l = wav_l[:len(wav)]
	    else:
	        wav, _ = rosa.load(args.wav, sr=args.sr, mono=True)
	        wav = _peak_normalize(wav)

	        # upsample to the original sampling rate
	        wav_l = resample_poly(wav, hparams.audio.sampling_rate, args.sr)
	        wav_l = wav_l[:len(wav_l) - len(wav_l) % hparams.audio.hop_length]

	    # Per-frequency-bin mask: 1 for bins below the input's Nyquist, 0 above.
	    fft_size = hparams.audio.filter_length // 2 + 1
	    band = torch.zeros(fft_size, dtype=torch.int64)
	    band[:int(hi * fft_size)] = 1

	    wav = torch.from_numpy(wav).unsqueeze(0).to(args.device)
	    wav_l = torch.from_numpy(wav_l.copy()).float().unsqueeze(0).to(args.device)
	    band = band.unsqueeze(0).to(args.device)

	    wav_recon, wav_list = model.inference(wav_l, band, args.steps, noise_schedule)

	    result_dir = hparams.log.test_result_dir

	    # Input audio as given (at 48 kHz for --gt, otherwise at the input rate).
	    wav = torch.clamp(wav, min=-1, max=1 - torch.finfo(torch.float16).eps)
	    save_stft_mag(wav, os.path.join(result_dir, 'wav.png'))
	    if args.gt:
	        swrite(os.path.join(result_dir, 'wav.wav'),
	               hparams.audio.sampling_rate, wav[0].detach().cpu().numpy())
	    else:
	        swrite(os.path.join(result_dir, 'wav.wav'),
	               args.sr, wav[0].detach().cpu().numpy())

	    # Low-bandwidth signal that was fed to the model.
	    wav_l = torch.clamp(wav_l, min=-1, max=1 - torch.finfo(torch.float16).eps)
	    save_stft_mag(wav_l, os.path.join(result_dir, 'wav_l.png'))
	    swrite(os.path.join(result_dir, 'wav_l.wav'),
	           hparams.audio.sampling_rate, wav_l[0].detach().cpu().numpy())

	    # Model output.
	    wav_recon = torch.clamp(wav_recon, min=-1, max=1 - torch.finfo(torch.float16).eps)
	    save_stft_mag(wav_recon, os.path.join(result_dir, 'result.png'))
	    swrite(os.path.join(result_dir, 'result.wav'),
	           hparams.audio.sampling_rate, wav_recon[0].detach().cpu().numpy())

	    # Optional: also dump every intermediate sampling step.
	    # for i in range(len(wav_list)):
	    #     wav_recon_i = torch.clamp(wav_list[i], min=-1, max=1-torch.finfo(torch.float16).eps)
	    #     save_stft_mag(wav_recon_i, os.path.join(hparams.log.test_result_dir, f'result_{i}.png'))
	    #     swrite(os.path.join(hparams.log.test_result_dir, f'result_{i}.wav'),
	    #            hparams.audio.sampling_rate, wav_recon_i[0].detach().cpu().numpy())
	#Some codes are adopted from
	#https://github.com/ivanvovk/WaveGrad
	#https://github.com/lmnt-com/diffwave
	#https://github.com/NVlabs/SPADE
	#https://github.com/pkumivision/FFC

	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	import torch.fft
	from math import sqrt, log

	# Short module-level aliases for the torch layers/activations used below.
	Linear = nn.Linear
	silu = F.silu
	relu = F.relu

	def Conv1d(*args, **kwargs):
	    """Create an ``nn.Conv1d`` whose weight is re-initialised with Kaiming-normal.

	    All positional and keyword arguments are forwarded unchanged to
	    ``nn.Conv1d``. The previous ``(args, *kwargs)`` signature rejected the
	    keyword arguments (``kernel_size=``, ``padding=``, ``bias=``) that the
	    callers in this file pass.

	    Returns:
	        The initialised ``nn.Conv1d`` layer.
	    """
	    layer = nn.Conv1d(*args, **kwargs)
	    nn.init.kaiming_normal_(layer.weight)
	    return layer

	def Conv2d(*args, **kwargs):
	    """Create an ``nn.Conv2d`` whose weight is re-initialised with Kaiming-normal.

	    All positional and keyword arguments are forwarded unchanged to
	    ``nn.Conv2d``. The previous ``(args, *kwargs)`` signature could not be
	    called with keyword arguments only, which is how this file uses it.

	    Returns:
	        The initialised ``nn.Conv2d`` layer.
	    """
	    layer = nn.Conv2d(*args, **kwargs)
	    nn.init.kaiming_normal_(layer.weight)
	    return layer


	class DiffusionEmbedding(nn.Module):
	    """Sinusoidal embedding of the noise level followed by two SiLU-activated linear layers."""

	    def __init__(self, hparams):
	        """Read the embedding sizes from ``hparams.dpm`` / ``hparams.arch`` and build both projections."""
	        super().__init__()
	        dpm_cfg = hparams.dpm
	        self.n_channels = dpm_cfg.pos_emb_channels
	        self.linear_scale = dpm_cfg.pos_emb_scale
	        self.out_channels = hparams.arch.pos_emb_dim

	        self.projection1 = Linear(self.n_channels, self.out_channels)
	        self.projection2 = Linear(self.out_channels, self.out_channels)

	    def forward(self, noise_level):
	        """Embed ``noise_level``; the result has ``out_channels`` features per entry.

	        A trailing axis is squeezed away first when ``noise_level`` has more
	        than one dimension.
	        """
	        if len(noise_level.shape) > 1:
	            noise_level = noise_level.squeeze(-1)
	        half_dim = self.n_channels // 2
	        # Geometric frequency ladder from 1 down to 1/10000.
	        decay = log(10000) / (half_dim - 1)
	        freqs = torch.exp(torch.arange(half_dim, dtype=torch.float32).to(noise_level) * -decay)
	        angles = self.linear_scale * noise_level.unsqueeze(1) * freqs.unsqueeze(0)
	        sinusoid = torch.cat((angles.sin(), angles.cos()), dim=-1)
	        hidden = silu(self.projection1(sinusoid))
	        return silu(self.projection2(hidden))


	class BSFT(nn.Module):
	    """Modulate a feature map with a scale and bias predicted from the band mask."""

	    def __init__(self, nhidden, out_channels):
	        """Build the shared conv over the 2-channel band and the gamma/beta heads."""
	        super().__init__()
	        conv_opts = dict(kernel_size=3, padding=1)
	        self.mlp_shared = nn.Conv1d(2, nhidden, **conv_opts)

	        self.mlp_gamma = Conv1d(nhidden, out_channels, **conv_opts)
	        self.mlp_beta = Conv1d(nhidden, out_channels, **conv_opts)

	    def forward(self, x, band):
	        """Return ``x * (1 + gamma) + beta`` with gamma/beta broadcast over the last axis of ``x``."""
	        # band: (B, 2, n_fft // 2 + 1)
	        hidden = silu(self.mlp_shared(band))

	        scale = 1 + self.mlp_gamma(hidden).unsqueeze(-1)
	        shift = self.mlp_beta(hidden).unsqueeze(-1)

	        # apply scale and bias
	        return x * scale + shift


	class FourierUnit(nn.Module):
	    """STFT -> band-conditioned modulation -> 1x1 conv -> inverse STFT.

	    Real and imaginary parts of the spectrogram are stacked on the channel
	    axis so the convolution and ``BSFT`` operate on real tensors.
	    """

	    def __init__(self, in_channels, out_channels, bsft_channels, filter_length=1024, hop_length=256, win_length=1024,
	                 sampling_rate=48000):
	        """Store the STFT settings and build the 1x1 convolution and the BSFT layer."""
	        # bn_layer not used
	        super(FourierUnit, self).__init__()
	        self.sampling_rate = sampling_rate
	        self.n_fft = filter_length
	        self.hop_size = hop_length
	        self.win_size = win_length
	        hann_window = torch.hann_window(win_length)
	        # Registered as a buffer so it follows the module across devices.
	        self.register_buffer('hann_window', hann_window)

	        self.conv_layer = Conv2d(in_channels=in_channels * 2, out_channels=out_channels * 2,
	                                 kernel_size=1, padding=0, bias=False)
	        self.bsft = BSFT(bsft_channels, out_channels * 2)

	    def forward(self, x, band):
	        """Transform ``x`` in the STFT domain and return a signal of the same length.

	        Args:
	            x: Input whose first axis is the batch and last axis is time.
	            band: Band conditioning passed to ``BSFT``.

	        Returns:
	            Tensor viewed as ``(batch, -1, time)`` with the input's time length.
	        """
	        batch = x.shape[0]
	        num_samples = x.size()[-1]

	        x = x.view(-1, num_samples)

	        # stft/istft handling adapted for compatibility with PyTorch 1.9+.
	        # 1. stft returns a complex tensor (return_complex=True).
	        ffted = torch.stft(
	            x, self.n_fft,
	            hop_length=self.hop_size,
	            win_length=self.win_size,
	            window=self.hann_window,
	            center=True,
	            normalized=True,
	            onesided=True,
	            return_complex=True,  # changed from False to True
	        )

	        # 2. Convert the complex tensor to a real one (adds a trailing axis of
	        #    size 2) so it can go through the convolution.
	        ffted = torch.view_as_real(ffted)
	        ffted = ffted.permute(0, 3, 1, 2).contiguous()  # (BC, 2, n_fft/2+1, T)
	        ffted = ffted.view((batch, -1,) + ffted.size()[2:])  # (B, 2C, n_fft/2+1, T)

	        ffted = relu(self.bsft(ffted, band))  # (B, 2C, n_fft/2+1, T)
	        ffted = self.conv_layer(ffted)

	        # 3. Convert back to a complex tensor for istft.
	        ffted = ffted.view((-1, 2,) + ffted.size()[2:]).permute(0, 2, 3, 1).contiguous()  # (BC, n_fft/2+1, T, 2)
	        ffted = torch.view_as_complex(ffted)

	        # Ask istft for exactly the input length. Without `length` the output
	        # is a whole number of hops, so inputs that are not a multiple of the
	        # hop size failed in the view below.
	        output = torch.istft(ffted, self.n_fft, hop_length=self.hop_size, win_length=self.win_size, window=self.hann_window,
	                             center=True, normalized=True, onesided=True, length=num_samples)
	        output = output.view(batch, -1, num_samples)
	        return output


	class SpectralTransform(nn.Module):
	    """Pointwise conv, Fourier unit with a residual connection, then a pointwise conv."""

	    def __init__(self, in_channels, out_channels, bsft_channels, **audio_kwargs):
	        """Build the half-width bottleneck around a ``FourierUnit``; ``audio_kwargs`` are forwarded to it."""
	        # bn_layer not used
	        super(SpectralTransform, self).__init__()
	        hidden_channels = out_channels // 2
	        self.conv1 = Conv1d(in_channels, hidden_channels, kernel_size=1, bias=False)

	        self.fu = FourierUnit(hidden_channels, hidden_channels, bsft_channels, **audio_kwargs)

	        self.conv2 = Conv1d(hidden_channels, out_channels, kernel_size=1, bias=False)

	    def forward(self, x, band):
	        """Return ``conv2(h + fu(h, band))`` where ``h = silu(conv1(x))``."""
	        reduced = silu(self.conv1(x))
	        spectral = self.fu(reduced, band)
	        return self.conv2(reduced + spectral)


	class FFC(nn.Module): # STFC
	    """Two-branch convolution mixing a local branch and a global (spectral) branch."""

	    def __init__(self, in_channels, out_channels, bsft_channels, kernel_size=3,
	                 ratio_gin=0.5, ratio_gout=0.5, padding=1,
	                 **audio_kwargs):
	        """Split the channels by ``ratio_gin``/``ratio_gout`` and build the four cross paths."""
	        super(FFC, self).__init__()

	        # Channel budget of the global branch; the local branch gets the rest.
	        in_cg = int(in_channels * ratio_gin)
	        out_cg = int(out_channels * ratio_gout)
	        in_cl, out_cl = in_channels - in_cg, out_channels - out_cg

	        self.ratio_gin = ratio_gin
	        self.ratio_gout = ratio_gout
	        self.global_in_num = in_cg

	        conv_opts = dict(padding=padding, bias=False)
	        self.convl2l = Conv1d(in_cl, out_cl, kernel_size, **conv_opts)
	        self.convl2g = Conv1d(in_cl, out_cg, kernel_size, **conv_opts)
	        self.convg2l = Conv1d(in_cg, out_cl, kernel_size, **conv_opts)
	        self.convg2g = SpectralTransform(in_cg, out_cg, bsft_channels, **audio_kwargs)

	    def forward(self, x_l, x_g, band):
	        """Return ``(local_out, global_out)``; each output sums one path from each input branch."""
	        local_out = self.convl2l(x_l) + self.convg2l(x_g)
	        global_out = self.convl2g(x_l) + self.convg2g(x_g, band)

	        return local_out, global_out


	class ResidualBlock(nn.Module):
	    """Gated residual block built around one ``FFC`` layer."""

	    def __init__(self, residual_channels, pos_emb_dim, bsft_channels, **audio_kwargs):
	        """Build the FFC layer, the noise-level projection and the output projection."""
	        super().__init__()
	        self.ffc1 = FFC(residual_channels, 2*residual_channels, bsft_channels,
	                        kernel_size=3, ratio_gin=0.5, ratio_gout=0.5, padding=1, **audio_kwargs) # STFC

	        self.diffusion_projection = Linear(pos_emb_dim, residual_channels)
	        self.output_projection = Conv1d(residual_channels,
	                                        2 * residual_channels, 1)

	    def forward(self, x, band, noise_level):
	        """Return ``(residual_output, skip)`` for input ``x``.

	        The projected noise level is added to ``x``; the sum is split into
	        local/global channel groups, run through the FFC layer, gated with
	        ``sigmoid * tanh`` and projected into a residual half and a skip half.
	        """
	        conditioning = self.diffusion_projection(noise_level).unsqueeze(-1)

	        y = x + conditioning
	        n_global = self.ffc1.global_in_num
	        y_l, y_g = torch.split(y, [y.shape[1] - n_global, n_global], dim=1)
	        y_l, y_g = self.ffc1(y_l, y_g, band) # STFC
	        gate_l, filt_l = torch.chunk(y_l, 2, dim=1)
	        gate_g, filt_g = torch.chunk(y_g, 2, dim=1)
	        gate = torch.cat((gate_l, gate_g), dim=1)
	        filt = torch.cat((filt_l, filt_g), dim=1)
	        y = self.output_projection(torch.sigmoid(gate) * torch.tanh(filt))
	        residual, skip = torch.chunk(y, 2, dim=1)
	        return (x + residual) / sqrt(2.0), skip


	class NuWave2(nn.Module):
	    """NU-Wave 2 network: input projection, a stack of residual blocks, and an output projection."""

	    def __init__(self, hparams):
	        """Build all layers from ``hparams.arch`` / ``hparams.audio`` / ``hparams.dpm``."""
	        super().__init__()
	        self.hparams = hparams
	        self.input_projection = Conv1d(2, hparams.arch.residual_channels, 1)
	        self.diffusion_embedding = DiffusionEmbedding(
	            hparams)
	        audio_kwargs = dict(filter_length = hparams.audio.filter_length, hop_length = hparams.audio.hop_length,
	                            win_length = hparams.audio.win_length, sampling_rate = hparams.audio.sampling_rate)
	        self.residual_layers = nn.ModuleList([
	            ResidualBlock(hparams.arch.residual_channels,
	                          hparams.arch.pos_emb_dim,
	                          hparams.arch.bsft_channels,
	                          **audio_kwargs)
	            for i in range(hparams.arch.residual_layers)
	        ])
	        self.len_res = len(self.residual_layers)
	        self.skip_projection = Conv1d(hparams.arch.residual_channels,
	                                      hparams.arch.residual_channels, 1)
	        self.output_projection = Conv1d(hparams.arch.residual_channels, 1, 1)

	    def forward(self, audio, audio_low, band, noise_level):
	        """Run the network on one noisy/low-resolution pair.

	        Args:
	            audio: Current noisy signal; stacked with ``audio_low`` on a new axis 1.
	            audio_low: Low-resolution conditioning signal, same shape as ``audio``.
	            band: Integer band mask with values 0/1; it is one-hot encoded
	                into two channels.
	            noise_level: Noise level passed to the diffusion embedding.

	        Returns:
	            The network output with the channel axis squeezed away.
	        """
	        x = torch.stack((audio, audio_low), dim=1)
	        x = self.input_projection(x)
	        x = silu(x)
	        noise_level = self.diffusion_embedding(noise_level)
	        # Always encode two classes. With the class count inferred, an
	        # all-zero mask produced a single channel and broke the 2-channel
	        # conditioning convolution in BSFT.
	        band = F.one_hot(band, num_classes=2).transpose(1, -1).float()

	        # Accumulating the skip connections in place is faster than
	        # collecting them in a list and stacking.
	        skip = 0.
	        for layer in self.residual_layers:
	            x, skip_connection = layer(x, band, noise_level)
	            skip += skip_connection

	        x = skip / sqrt(self.len_res)
	        x = self.skip_projection(x)
	        x = silu(x)
	        x = self.output_projection(x).squeeze(1)
	        return x