shhommychon · February 17, 2024 12:30
diff --git a/drop_duplicate_favorites.ipynb b/drop_duplicate_favorites.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "collapsed_sections": [
        "HqorbDcCCad0"
      ]
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/96e944260fb0a2d7549fce27ff9c3032)"
      ],
      "metadata": {
        "id": "KgfwUDWSBNmd"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "# 인터넷 즐겨찾기 중복 링크 제거"
      ],
      "metadata": {
        "id": "t1eLqShWCUFh"
      }
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 설정"
      ],
      "metadata": {
        "id": "f6k-YS_VCY4x"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Bookmark export to read, and the de-duplicated file to write.\n",
        "EDGE_FAVORITES_FILENAME = \"favorites_2_17_24.html\"\n",
        "OUTPUT_FILENAME = \"noduplicates_2_17_24.html\"\n",
        "\n",
        "# Which copy of a duplicated link survives:\n",
        "#   LEAVE_EARLIEST_ROW - True keeps the earliest date, False the latest.\n",
        "#   LEAVE_UPPER_ROW    - tie-breaker on line position: True keeps the\n",
        "#                        smaller line index, False the bigger one.\n",
        "LEAVE_EARLIEST_ROW = False\n",
        "LEAVE_UPPER_ROW = False"
      ],
      "metadata": {
        "id": "DGxcVkrbAiSI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## 중복 제거"
      ],
      "metadata": {
        "id": "HqorbDcCCad0"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the export as a list of raw lines (line endings kept).\n",
        "with open(EDGE_FAVORITES_FILENAME, 'r') as src:\n",
        "    lines = list(src)\n",
        "len(lines)"
      ],
      "metadata": {
        "id": "d6Mq_1V8N5fx"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import pandas as pd\n",
        "\n",
        "# One row per input line; reset_index() adds an \"index\" column holding the line number.\n",
        "df = pd.DataFrame({\"line\": lines}).reset_index()\n",
        "# Flag bookmark entries: lines that start with <DT><A once stripped.\n",
        "df[\"IS_DT\"] = df[\"line\"].str.strip().str.startswith(\"<DT><A\")\n",
        "df.head()"
      ],
      "metadata": {
        "id": "EFNZupY6N-a0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "import re\n",
        "\n",
        "# Capture the quoted value of the HREF / ADD_DATE attributes.\n",
        "href_regex = re.compile(\"HREF=\\\"([^\\\"]+)\\\"\")\n",
        "date_regex = re.compile(\"ADD_DATE=\\\"([^\\\"]+)\\\"\")\n",
        "\n",
        "def href_func(line):\n",
        "    \"\"\"Return the line's HREF value minus one trailing '/', or '' if it has none.\"\"\"\n",
        "    found = href_regex.search(line)\n",
        "    if found is None:\n",
        "        return ''\n",
        "    url = found.group(1)\n",
        "    # Drop a single trailing slash so \"site/\" and \"site\" count as one link.\n",
        "    return url[:-1] if url.endswith('/') else url\n",
        "\n",
        "def date_func(line):\n",
        "    \"\"\"Return the ADD_DATE value of a line that has an HREF, else ''.\n",
        "\n",
        "    Also returns '' when the link carries no ADD_DATE attribute.\n",
        "    \"\"\"\n",
        "    if href_regex.search(line) is None:\n",
        "        return ''\n",
        "    found = date_regex.search(line)\n",
        "    # search() yields None for a link without ADD_DATE; don't call .group on it.\n",
        "    return found.group(1) if found is not None else ''\n",
        "\n",
        "df[\"HREF\"] = df[\"line\"].apply(href_func)\n",
        "df[\"DATE\"] = df[\"line\"].apply(date_func)\n",
        "df.head()"
      ],
      "metadata": {
        "id": "u2ZUcF5a4_YY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Split bookmark rows from everything else (headers, folders, markup).\n",
        "is_bookmark = df[\"IS_DT\"]\n",
        "dt_df = df.loc[is_bookmark]\n",
        "other_df = df.loc[~is_bookmark]\n",
        "\n",
        "# Sanity check: the two parts must add up to the whole frame.\n",
        "len(dt_df), len(other_df), len(dt_df) + len(other_df) == len(df)"
      ],
      "metadata": {
        "id": "JpafoMuL5064"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# ADD_DATE looks like epoch seconds stored as text (TODO confirm for your export);\n",
        "# rank it numerically so values with different digit counts order correctly.\n",
        "# Dates that are not numbers become NaN and are ranked last.\n",
        "clean_df = (\n",
        "    dt_df\n",
        "    .assign(_date_num=pd.to_numeric(dt_df[\"DATE\"], errors=\"coerce\"))\n",
        "    .sort_values(by=[\"_date_num\", \"index\"], ascending=[LEAVE_EARLIEST_ROW, LEAVE_UPPER_ROW])\n",
        "    .groupby(\"HREF\")\n",
        "    .head(1)  # first row of each HREF group in sort order is the survivor\n",
        "    .drop(columns=\"_date_num\")\n",
        ")\n",
        "len(dt_df), len(clean_df)"
      ],
      "metadata": {
        "id": "Ast75k_A75oZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Put the surviving bookmarks back among the untouched lines, in file order.\n",
        "merged = pd.concat([clean_df, other_df])\n",
        "new_df = merged.sort_values(\"index\")\n",
        "new_df.head()"
      ],
      "metadata": {
        "id": "zNNZXHKA-VSs"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Each stored line is written back unchanged; nothing is added between lines.\n",
        "with open(OUTPUT_FILENAME, 'w') as out:\n",
        "    out.writelines(new_df[\"line\"])"
      ],
      "metadata": {
        "id": "cTYHFU9H-uJY"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"provenance": [],
	"collapsed_sections": [
	"HqorbDcCCad0"
	]
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"language_info": {
	"name": "python"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"source": [
	"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/96e944260fb0a2d7549fce27ff9c3032)"
	],
	"metadata": {
	"id": "KgfwUDWSBNmd"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"# 인터넷 즐겨찾기 중복 링크 제거"
	],
	"metadata": {
	"id": "t1eLqShWCUFh"
	}
	},
	{
	"cell_type": "markdown",
	"source": [
	"## 설정"
	],
	"metadata": {
	"id": "f6k-YS_VCY4x"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Bookmark export to read, and the de-duplicated file to write.\n",
	"EDGE_FAVORITES_FILENAME = \"favorites_2_17_24.html\"\n",
	"OUTPUT_FILENAME = \"noduplicates_2_17_24.html\"\n",
	"\n",
	"# Which copy of a duplicated link survives:\n",
	"#   LEAVE_EARLIEST_ROW - True keeps the earliest date, False the latest.\n",
	"#   LEAVE_UPPER_ROW    - tie-breaker on line position: True keeps the\n",
	"#                        smaller line index, False the bigger one.\n",
	"LEAVE_EARLIEST_ROW = False\n",
	"LEAVE_UPPER_ROW = False"
	],
	"metadata": {
	"id": "DGxcVkrbAiSI"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"## 중복 제거"
	],
	"metadata": {
	"id": "HqorbDcCCad0"
	}
	},
	{
	"cell_type": "code",
	"source": [
	"# Load the export as a list of raw lines (line endings kept).\n",
	"with open(EDGE_FAVORITES_FILENAME, 'r') as src:\n",
	"    lines = list(src)\n",
	"len(lines)"
	],
	"metadata": {
	"id": "d6Mq_1V8N5fx"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import pandas as pd\n",
	"\n",
	"# One row per input line; reset_index() adds an \"index\" column holding the line number.\n",
	"df = pd.DataFrame({\"line\": lines}).reset_index()\n",
	"# Flag bookmark entries: lines that start with <DT><A once stripped.\n",
	"df[\"IS_DT\"] = df[\"line\"].str.strip().str.startswith(\"<DT><A\")\n",
	"df.head()"
	],
	"metadata": {
	"id": "EFNZupY6N-a0"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"import re\n",
	"\n",
	"# Capture the quoted value of the HREF / ADD_DATE attributes.\n",
	"href_regex = re.compile(\"HREF=\\\"([^\\\"]+)\\\"\")\n",
	"date_regex = re.compile(\"ADD_DATE=\\\"([^\\\"]+)\\\"\")\n",
	"\n",
	"def href_func(line):\n",
	"    \"\"\"Return the line's HREF value minus one trailing '/', or '' if it has none.\"\"\"\n",
	"    found = href_regex.search(line)\n",
	"    if found is None:\n",
	"        return ''\n",
	"    url = found.group(1)\n",
	"    # Drop a single trailing slash so \"site/\" and \"site\" count as one link.\n",
	"    return url[:-1] if url.endswith('/') else url\n",
	"\n",
	"def date_func(line):\n",
	"    \"\"\"Return the ADD_DATE value of a line that has an HREF, else ''.\n",
	"\n",
	"    Also returns '' when the link carries no ADD_DATE attribute.\n",
	"    \"\"\"\n",
	"    if href_regex.search(line) is None:\n",
	"        return ''\n",
	"    found = date_regex.search(line)\n",
	"    # search() yields None for a link without ADD_DATE; don't call .group on it.\n",
	"    return found.group(1) if found is not None else ''\n",
	"\n",
	"df[\"HREF\"] = df[\"line\"].apply(href_func)\n",
	"df[\"DATE\"] = df[\"line\"].apply(date_func)\n",
	"df.head()"
	],
	"metadata": {
	"id": "u2ZUcF5a4_YY"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Split bookmark rows from everything else (headers, folders, markup).\n",
	"is_bookmark = df[\"IS_DT\"]\n",
	"dt_df = df.loc[is_bookmark]\n",
	"other_df = df.loc[~is_bookmark]\n",
	"\n",
	"# Sanity check: the two parts must add up to the whole frame.\n",
	"len(dt_df), len(other_df), len(dt_df) + len(other_df) == len(df)"
	],
	"metadata": {
	"id": "JpafoMuL5064"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# ADD_DATE looks like epoch seconds stored as text (TODO confirm for your export);\n",
	"# rank it numerically so values with different digit counts order correctly.\n",
	"# Dates that are not numbers become NaN and are ranked last.\n",
	"clean_df = (\n",
	"    dt_df\n",
	"    .assign(_date_num=pd.to_numeric(dt_df[\"DATE\"], errors=\"coerce\"))\n",
	"    .sort_values(by=[\"_date_num\", \"index\"], ascending=[LEAVE_EARLIEST_ROW, LEAVE_UPPER_ROW])\n",
	"    .groupby(\"HREF\")\n",
	"    .head(1)  # first row of each HREF group in sort order is the survivor\n",
	"    .drop(columns=\"_date_num\")\n",
	")\n",
	"len(dt_df), len(clean_df)"
	],
	"metadata": {
	"id": "Ast75k_A75oZ"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Put the surviving bookmarks back among the untouched lines, in file order.\n",
	"merged = pd.concat([clean_df, other_df])\n",
	"new_df = merged.sort_values(\"index\")\n",
	"new_df.head()"
	],
	"metadata": {
	"id": "zNNZXHKA-VSs"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# Each stored line is written back unchanged; nothing is added between lines.\n",
	"with open(OUTPUT_FILENAME, 'w') as out:\n",
	"    out.writelines(new_df[\"line\"])"
	],
	"metadata": {
	"id": "cTYHFU9H-uJY"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}
No results found