Created
October 24, 2022 20:35
-
-
Save yy-zhong/7e29e5c69d14e459dc4b752f7e94960c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# TODO: implement a backup of the original vault before modifying it in place
| import json | |
| import os | |
| import re | |
| import shutil | |
# Every link name ever seen, including links that have no corresponding file.
ALL_LINKS = []
# Maps a link name to the relative path of its file.
LINK_TO_FILENAME = {}
# Maps a file path to the list of links found inside that file.
LINKS_IN_FILE = {}
# Reverse index: maps a link name to the list of files it appears in.
REVERSE_INDEX = {}
# Links that do have a corresponding file on disk.
STRIPED_LINKS = []
# Links whose target file has not been created yet.
UNCREATED_LINKS = []
# Links whose file exists but is not referenced by any other file.
ISOLATED_LINKS = []
def delete_non_md_files_and_dirs(vault_name: str):
    """Remove every non-markdown file under *vault_name*, then prune empty dirs.

    The directory pass walks bottom-up (``topdown=False``) so that a directory
    which becomes empty after its children are removed is itself deleted; the
    previous top-down walk visited parents before children and left nested
    empty directories behind.
    """
    for root, _dirs, files in os.walk(vault_name):
        for file in files:
            if not file.endswith(".md"):
                os.remove(os.path.join(root, file))
    # Bottom-up pass: delete directories that are now empty.
    for root, _dirs, _files in os.walk(vault_name, topdown=False):
        if not os.listdir(root):
            os.rmdir(root)
def get_all_links_in_file(filename):
    """Return every wiki-link name found in *filename*.

    The result is a list of link targets without the ``.md`` suffix; e.g. a
    file containing ``[[APP]]`` and ``[[test]]`` yields ``["APP", "test"]``.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        text = handle.read()
    return re.findall(r"\[\[(.*?)\]\]", text)
def find_files_by_link(link, vault_name):
    """Locate the file named ``<link>.md`` anywhere under *vault_name*.

    Returns the path relative to the vault (e.g. ``/test/test.md``), or
    ``None`` when no file matches.  Side effect: records the full path of the
    first match in ``LINK_TO_FILENAME``.
    """
    target = link + ".md"
    for root, _dirs, files in os.walk(vault_name):
        if target in files:
            full_path = os.path.join(root, target)
            LINK_TO_FILENAME[link] = full_path
            return full_path.replace(vault_name, "")
    return None
def traverse_all_files(vault_name):
    """Index every file in the vault.

    Populates the module-level indexes:

    - ``ALL_LINKS``: every link name seen (display alias after ``|``
      stripped), plus the names of files that contain no links at all.
    - ``LINKS_IN_FILE``: file path -> list of raw links found in that file.
    - ``LINK_TO_FILENAME``: link name -> vault-relative path of its file.
    """
    for root, dirs, files in os.walk(vault_name):
        for file in files:
            path = os.path.join(root, file)
            links_in_one_file = get_all_links_in_file(path)
            if links_in_one_file:
                LINKS_IN_FILE[path] = links_in_one_file
                for link in links_in_one_file:
                    # "[[page|alias]]" points at "page"; search for the real
                    # target name, not the raw "page|alias" text (the old code
                    # looked for "page|alias.md", so aliased links never
                    # resolved).
                    target = link.split("|")[0]
                    if target not in ALL_LINKS:
                        ALL_LINKS.append(target)
                    # Bound to a fresh name: the old code assigned to `files`,
                    # shadowing the os.walk() variable.
                    relative = find_files_by_link(target, vault_name)
                    if relative:
                        LINK_TO_FILENAME[target] = relative
            else:
                # A file with no outgoing links is still a potential link
                # target: register it under its stem.  splitext strips only
                # the final extension (`.replace(".md", "")` mangled names
                # containing ".md" elsewhere).
                stem = os.path.splitext(file)[0]
                if stem not in ALL_LINKS:
                    ALL_LINKS.append(stem)
                    LINK_TO_FILENAME[stem] = path.replace(vault_name, "")
def replace_double_link_to_normal_link(vault_name):
    """Rewrite wiki-links as standard markdown links across the whole vault.

    ``[[test]]`` becomes ``[test](/test/test.md)`` using the paths collected
    in ``LINK_TO_FILENAME``; links with no known target are left untouched.
    This mutates files on disk, so it must run after all indexing is done.
    """
    for root, dirs, files in os.walk(vault_name):
        for file in files:
            path = os.path.join(root, file)
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            original = content
            for link in re.findall(r"\[\[(.*?)\]\]", content):
                if link in LINK_TO_FILENAME:
                    content = content.replace(
                        "[[" + link + "]]",
                        "[" + link + "](" + LINK_TO_FILENAME[link] + ")",
                    )
            # Only rewrite the file when something actually changed; the old
            # code rewrote (and re-timestamped) every file unconditionally.
            if content != original:
                with open(path, "w", encoding="utf-8") as f:
                    f.write(content)
def wrap_all_url_links(vault_name):
    """Wrap every bare http(s) URL in angle brackets (``<url>``).

    This mutates files on disk, so it must run after all indexing is done.

    The previous implementation called ``str.replace`` once per regex match,
    which double-wrapped URLs occurring more than once (``<<url>>``), and its
    ``(?=\\s)`` lookahead missed a URL at the very end of a file.  A single
    ``re.sub`` pass fixes both.
    """
    # Match a whitespace-free run starting with "http", but skip URLs that
    # are already wrapped (preceded by "<") and stop before ">" so an
    # existing closing bracket is not swallowed.
    pattern = re.compile(r"(?<!<)(http[^\s>]+)")
    for root, dirs, files in os.walk(vault_name):
        for file in files:
            path = os.path.join(root, file)
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            wrapped = pattern.sub(r"<\1>", content)
            if wrapped != content:
                with open(path, "w", encoding="utf-8") as f:
                    f.write(wrapped)
def create_reverse_index():
    """Build REVERSE_INDEX (link -> files containing it) from LINKS_IN_FILE."""
    for file, links in LINKS_IN_FILE.items():
        for link in links:
            REVERSE_INDEX.setdefault(link, []).append(file)
def strip_empty_links():
    """Split ALL_LINKS into STRIPED_LINKS (file exists) and UNCREATED_LINKS."""
    for link in ALL_LINKS:
        bucket = STRIPED_LINKS if link in LINK_TO_FILENAME else UNCREATED_LINKS
        bucket.append(link)
def flush_all_stored_data():
    """Dump every index to a JSON file in the working directory, then reset them.

    Dicts are written with sorted keys and lists are written sorted, so the
    output files are stable across runs and easy to diff.  The seven
    copy-pasted open/dump stanzas of the old version (with redundant
    ``f.close()`` calls inside ``with`` blocks, and a no-op ``sort_keys``
    on plain lists) are collapsed into one table-driven loop.
    """
    dumps = {
        "LINK_TO_FILENAME.json": LINK_TO_FILENAME,
        "REVERSE_INDEX.json": REVERSE_INDEX,
        "LINKS_IN_FILE.json": LINKS_IN_FILE,
        "ALL_LINKS.json": sorted(ALL_LINKS),
        "STRIPED_LINKS.json": sorted(STRIPED_LINKS),
        "UNCREATED_LINKS.json": sorted(UNCREATED_LINKS),
        "ISOLATED_LINKS.json": sorted(ISOLATED_LINKS),
    }
    for filename, data in dumps.items():
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=4, sort_keys=True, ensure_ascii=False)
    # Reset all in-memory indexes so a subsequent run starts clean.
    ALL_LINKS.clear()
    LINK_TO_FILENAME.clear()
    LINKS_IN_FILE.clear()
    REVERSE_INDEX.clear()
    STRIPED_LINKS.clear()
    UNCREATED_LINKS.clear()
    ISOLATED_LINKS.clear()
def find_isolated_links():
    """Collect links whose file is not referenced by any other file.

    Every link in STRIPED_LINKS has a file on disk; when the link is absent
    from REVERSE_INDEX, no file's link list mentions it, so that note is
    isolated from the rest of the vault.
    """
    ISOLATED_LINKS.extend(
        link for link in STRIPED_LINKS if link not in REVERSE_INDEX
    )
if __name__ == "__main__":
    VAULT_NAME = "content"
    # Destructive: strips every non-markdown file from the vault in place.
    delete_non_md_files_and_dirs(vault_name=VAULT_NAME)
    # Build ALL_LINKS, LINKS_IN_FILE and LINK_TO_FILENAME.
    traverse_all_files(vault_name=VAULT_NAME)
    # Derive the remaining indexes from the data gathered above.
    create_reverse_index()
    strip_empty_links()
    find_isolated_links()
    print("All links: ", len(ALL_LINKS))
    print("Striped links: ", len(STRIPED_LINKS))
    print("Uncreated links: ", len(UNCREATED_LINKS))
    # NOTE(review): replace_double_link_to_normal_link() and
    # wrap_all_url_links() are defined but never invoked here — confirm
    # whether they are meant to run before flushing.
    flush_all_stored_data()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment