Last active
February 17, 2024 12:30
-
-
Save shhommychon/96e944260fb0a2d7549fce27ff9c3032 to your computer and use it in GitHub Desktop.
내가 쓰려고 남겨놓는 엣지 즐겨찾기 중복 제거기
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "collapsed_sections": [ | |
| "HqorbDcCCad0" | |
| ] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/shhommychon/96e944260fb0a2d7549fce27ff9c3032)" | |
| ], | |
| "metadata": { | |
| "id": "KgfwUDWSBNmd" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# 인터넷 즐겨찾기 중복 링크 제거" | |
| ], | |
| "metadata": { | |
| "id": "t1eLqShWCUFh" | |
| } | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 설정" | |
| ], | |
| "metadata": { | |
| "id": "f6k-YS_VCY4x" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Input: the favorites HTML file to read; output: where the de-duplicated copy is written.\n", | |
| "EDGE_FAVORITES_FILENAME = \"favorites_2_17_24.html\"\n", | |
| "OUTPUT_FILENAME = \"noduplicates_2_17_24.html\"\n", | |
| "\n", | |
| "# These two flags are passed as the `ascending` values when duplicate links are sorted\n", | |
| "# by (DATE, index); the first row of each HREF group after that sort is the one kept.\n", | |
| "LEAVE_EARLIEST_ROW = False # True: keep the row with the earliest date; False: keep the latest date\n", | |
| "LEAVE_UPPER_ROW = False # True: keep the row with the smaller index; False: keep the bigger index" | |
| ], | |
| "metadata": { | |
| "id": "DGxcVkrbAiSI" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "## 중복 제거" | |
| ], | |
| "metadata": { | |
| "id": "HqorbDcCCad0" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Load the exported favorites file as a list of raw lines (line endings kept).\n", | |
| "# NOTE(review): this relies on the platform default text encoding; exported bookmark\n", | |
| "# files are typically UTF-8 - confirm, and if an explicit encoding is added here,\n", | |
| "# use the same one when writing OUTPUT_FILENAME.\n", | |
| "with open(EDGE_FAVORITES_FILENAME, 'r') as favorites_file:\n", | |
| "    lines = list(favorites_file)\n", | |
| "len(lines)" | |
| ], | |
| "metadata": { | |
| "id": "d6Mq_1V8N5fx" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import pandas as pd\n", | |
| "\n", | |
| "# One row per source line; reset_index() exposes the row position as an \"index\" column.\n", | |
| "df = pd.DataFrame({\"line\": lines}).reset_index()\n", | |
| "# IS_DT marks rows whose stripped text begins with \"<DT><A\" (bookmark link rows).\n", | |
| "df[\"IS_DT\"] = df[\"line\"].apply(lambda text: text.strip().startswith(\"<DT><A\"))\n", | |
| "df.head()" | |
| ], | |
| "metadata": { | |
| "id": "EFNZupY6N-a0" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import re\n", | |
| "\n", | |
| "# Attribute patterns for bookmark link lines; group 1 captures the quoted value.\n", | |
| "href_regex = re.compile(\"HREF=\\\"([^\\\"]+)\\\"\")\n", | |
| "date_regex = re.compile(\"ADD_DATE=\\\"([^\\\"]+)\\\"\")\n", | |
| "\n", | |
| "def href_func(line):\n", | |
| "    \"\"\"Return the HREF value found in `line`, without one trailing '/'.\n", | |
| "\n", | |
| "    Returns '' when `line` has no HREF attribute. Dropping the trailing slash\n", | |
| "    gives a URL written with and without it the same de-duplication key.\n", | |
| "    \"\"\"\n", | |
| "    match = href_regex.search(line)\n", | |
| "    if match is None:\n", | |
| "        return ''\n", | |
| "    href = match.group(1)\n", | |
| "    return href[:-1] if href.endswith('/') else href\n", | |
| "\n", | |
| "def date_func(line):\n", | |
| "    \"\"\"Return the ADD_DATE value found in `line` as a string, or ''.\n", | |
| "\n", | |
| "    '' is returned when `line` has no HREF attribute, and also when it has an\n", | |
| "    HREF but no ADD_DATE attribute. The second case used to raise\n", | |
| "    AttributeError because the failed match was dereferenced.\n", | |
| "    \"\"\"\n", | |
| "    # Only lines that carry a link are dated; everything else gets ''.\n", | |
| "    if href_regex.search(line) is None:\n", | |
| "        return ''\n", | |
| "    match = date_regex.search(line)\n", | |
| "    # An undated link yields '', which sorts before every real date string.\n", | |
| "    return match.group(1) if match is not None else ''\n", | |
| "\n", | |
| "# Add the normalized link target and the added-date string for every line.\n", | |
| "df = df.assign(\n", | |
| "    HREF=df[\"line\"].apply(href_func),\n", | |
| "    DATE=df[\"line\"].apply(date_func),\n", | |
| ")\n", | |
| "df.head()" | |
| ], | |
| "metadata": { | |
| "id": "u2ZUcF5a4_YY" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Partition the rows by the IS_DT flag: bookmark links vs. every other line.\n", | |
| "is_link_row = df[\"IS_DT\"]\n", | |
| "dt_df = df[is_link_row]\n", | |
| "other_df = df[~is_link_row]\n", | |
| "\n", | |
| "# Sanity check: the two parts must add up to the whole frame.\n", | |
| "len(dt_df), len(other_df), len(dt_df) + len(other_df) == len(df)" | |
| ], | |
| "metadata": { | |
| "id": "JpafoMuL5064" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Rank link rows by DATE (compared as text), then by original position, in the\n", | |
| "# directions chosen by the two LEAVE_* flags; the first row per HREF is kept.\n", | |
| "ranked_df = dt_df.sort_values(\n", | |
| "    by=[\"DATE\", \"index\"],\n", | |
| "    ascending=[LEAVE_EARLIEST_ROW, LEAVE_UPPER_ROW],\n", | |
| ")\n", | |
| "clean_df = ranked_df.groupby(\"HREF\").head(1)\n", | |
| "len(dt_df), len(clean_df)" | |
| ], | |
| "metadata": { | |
| "id": "Ast75k_A75oZ" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Put the surviving links back among the untouched lines, in original file order.\n", | |
| "merged_df = pd.concat([clean_df, other_df])\n", | |
| "new_df = merged_df.sort_values(by=\"index\")\n", | |
| "new_df.head()" | |
| ], | |
| "metadata": { | |
| "id": "zNNZXHKA-VSs" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Write the kept lines back out unchanged; each one still carries its own line ending.\n", | |
| "with open(OUTPUT_FILENAME, 'w') as output_file:\n", | |
| "    output_file.writelines(new_df[\"line\"])" | |
| ], | |
| "metadata": { | |
| "id": "cTYHFU9H-uJY" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment