Last active
February 17, 2024 17:52
-
-
Save shhommychon/4287a1660a76045e066c9da53ab01d73 to your computer and use it in GitHub Desktop.
내가 쓰려고 남겨놓는 깨진 한글 복원기
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "collapsed_sections": [ | |
| "zFbzRE9-LwwJ" | |
| ] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/4287a1660a76045e066c9da53ab01d73)" | |
| ], | |
| "metadata": { | |
| "id": "3udRTKlHLSCi" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# 인코딩 깨진 한글 복원하기" | |
| ], | |
| "metadata": { | |
| "id": "5NmzYbcSLqxZ" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 코덱" | |
| ], | |
| "metadata": { | |
| "id": "zFbzRE9-LwwJ" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "id": "z6yE7NQjBpc9" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Python codec names that the search below tries as both encoder and decoder.\n", | |
| "# The order of the names is the order in which codec pairs are attempted.\n", | |
| "STRING_CODECS = tuple(\n", | |
| "    \"\"\"\n", | |
| "    ascii big5 big5hkscs\n", | |
| "    cp037 cp1006 cp1026 cp1125 cp1140\n", | |
| "    cp1250 cp1251 cp1252 cp1253 cp1254 cp1255 cp1256 cp1257 cp1258\n", | |
| "    cp273 cp424 cp437 cp500 cp65001\n", | |
| "    cp720 cp737 cp775\n", | |
| "    cp850 cp852 cp855 cp856 cp857 cp858\n", | |
| "    cp860 cp861 cp862 cp863 cp864 cp865 cp866 cp869\n", | |
| "    cp874 cp875 cp932 cp949 cp950\n", | |
| "    euc_jis_2004 euc_jisx0213 euc_jp euc_kr\n", | |
| "    gb18030 gb2312 gbk hz\n", | |
| "    iso2022_jp iso2022_jp_1 iso2022_jp_2 iso2022_jp_2004 iso2022_jp_3 iso2022_jp_ext iso2022_kr\n", | |
| "    iso8859_10 iso8859_11 iso8859_13 iso8859_14 iso8859_15 iso8859_16\n", | |
| "    iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6 iso8859_7 iso8859_8 iso8859_9\n", | |
| "    johab koi8_r koi8_t koi8_u kz1048 latin_1\n", | |
| "    mac_cyrillic mac_greek mac_iceland mac_latin2 mac_roman mac_turkish\n", | |
| "    ptcp154 shift_jis shift_jis_2004 shift_jisx0213\n", | |
| "    utf_16 utf_16_be utf_16_le utf_32 utf_32_be utf_32_le utf_7 utf_8 utf_8_sig\n", | |
| "    \"\"\".split()\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 올바른 코덱 찾기" | |
| ], | |
| "metadata": { | |
| "id": "WuqyakeJLysY" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Garbled (mojibake) input to recover; the stored output of the search cell shows it\n", | |
| "# reads as Korean again after e.g. encoding with mac_roman and decoding with cp949.\n", | |
| "target_str = \"D:\Data\Backup\øæ≥؃ƒªÁ¡¯∏\uf8ff¿Ω\t815.1 MB\t2015-12-20 ø¿»ƒ 10:32:32\"" | |
| ], | |
| "metadata": { | |
| "id": "xjU2mBBjH2H2" | |
| }, | |
| "execution_count": 2, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import re\n", | |
| "\n", | |
| "# Matches a run of precomposed Hangul syllables (U+AC00 '가' through U+D7A3 '힣').\n", | |
| "KOR_REGEX = re.compile(\"[가-힣]+\")\n", | |
| "\n", | |
| "\n", | |
| "def contains_kor(s):\n", | |
| "    \"\"\"Return True when `s` contains at least one Hangul syllable.\"\"\"\n", | |
| "    return KOR_REGEX.search(s) is not None\n", | |
| "\n", | |
| "\n", | |
| "# Brute-force every (encode, decode) codec pair: re-encode the garbled text with\n", | |
| "# one codec, decode the resulting bytes with another, and report each pair whose\n", | |
| "# result contains Korean.\n", | |
| "for ec in STRING_CODECS:\n", | |
| "    for dc in STRING_CODECS:\n", | |
| "        if ec == dc:\n", | |
| "            continue\n", | |
| "        try:\n", | |
| "            translated_str = target_str.encode(ec).decode(dc)\n", | |
| "        except (LookupError, UnicodeError):\n", | |
| "            # LookupError: the codec is not available in this Python build.\n", | |
| "            # UnicodeError: the text or bytes are not valid for this codec.\n", | |
| "            # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.\n", | |
| "            continue\n", | |
| "        if contains_kor(translated_str):\n", | |
| "            print(f\"ENC: {ec} / DEC: {dc}\")\n", | |
| "            print(f\"\\t{translated_str}\\n\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "4ilXc-grIDen", | |
| "outputId": "970516c9-5f60-455a-dd7d-d83b66ccd90a" | |
| }, | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "ENC: cp65001 / DEC: utf_16\n", | |
| "\t㩄䑜瑡屡慂正灵썜쎸ꖉ飃鋆鋆ꫂ臃ꇂ꿂裢뾣뿂㠉㔱ㄮ䴠ू〲㔱ㄭⴲ〲쌠슸슿욻ₒ〱㌺㨲㈳\n", | |
| "\n", | |
| "ENC: cp65001 / DEC: utf_16_be\n", | |
| "\t䐺屄慴慜䉡捫異峃룃ꛢ覥쎘욒욒슪쎁슡슯迯ꎿ슿캩सㄵ⸱⁍䈉㈰ㄵⴱ㈭㈰룂뿂믆鈠㨳㈺㌲\n", | |
| "\n", | |
| "ENC: cp65001 / DEC: utf_16_le\n", | |
| "\t㩄䑜瑡屡慂正灵썜쎸ꖉ飃鋆鋆ꫂ臃ꇂ꿂裢뾣뿂㠉㔱ㄮ䴠ू〲㔱ㄭⴲ〲쌠슸슿욻ₒ〱㌺㨲㈳\n", | |
| "\n", | |
| "ENC: mac_iceland / DEC: cp949\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: mac_iceland / DEC: euc_kr\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: mac_roman / DEC: cp949\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: mac_roman / DEC: euc_kr\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: mac_turkish / DEC: cp949\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: mac_turkish / DEC: euc_kr\n", | |
| "\tD:\\Data\\Backup\\옛날컴사진모음\t815.1 MB\t2015-12-20 오후 10:32:32\n", | |
| "\n", | |
| "ENC: utf_8 / DEC: utf_16\n", | |
| "\t㩄䑜瑡屡慂正灵썜쎸ꖉ飃鋆鋆ꫂ臃ꇂ꿂裢뾣뿂㠉㔱ㄮ䴠ू〲㔱ㄭⴲ〲쌠슸슿욻ₒ〱㌺㨲㈳\n", | |
| "\n", | |
| "ENC: utf_8 / DEC: utf_16_be\n", | |
| "\t䐺屄慴慜䉡捫異峃룃ꛢ覥쎘욒욒슪쎁슡슯迯ꎿ슿캩सㄵ⸱⁍䈉㈰ㄵⴱ㈭㈰룂뿂믆鈠㨳㈺㌲\n", | |
| "\n", | |
| "ENC: utf_8 / DEC: utf_16_le\n", | |
| "\t㩄䑜瑡屡慂正灵썜쎸ꖉ飃鋆鋆ꫂ臃ꇂ꿂裢뾣뿂㠉㔱ㄮ䴠ू〲㔱ㄭⴲ〲쌠슸슿욻ₒ〱㌺㨲㈳\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment