Skip to content

Instantly share code, notes, and snippets.

@yy-zhong
Created October 24, 2022 20:35
Show Gist options
  • Select an option

  • Save yy-zhong/7e29e5c69d14e459dc4b752f7e94960c to your computer and use it in GitHub Desktop.

Select an option

Save yy-zhong/7e29e5c69d14e459dc4b752f7e94960c to your computer and use it in GitHub Desktop.
# TODO: back up the original vault before modifying it (not implemented yet).
import json
import os
import re
import shutil
# Every link name seen in the vault, including links that have no matching file.
ALL_LINKS = []
# Maps each link name to the relative path of its file.
LINK_TO_FILENAME = {}
# Maps each file to the list of links found inside it.
LINKS_IN_FILE = {}
# Maps each link to the list of files it appears in.
REVERSE_INDEX = {}
# Links that do have a matching file.
STRIPED_LINKS = []
# Links whose file has not been created yet.
UNCREATED_LINKS = []
# Links that are not yet connected to any other file.
ISOLATED_LINKS = []
def delete_non_md_files_and_dirs(vault_name: str):
    """Delete every non-Markdown file under the vault, then prune empty folders.

    Args:
        vault_name: Path of the vault directory to clean in place.

    Note:
        This is destructive. The vault root itself is removed as well if
        nothing is left inside it.
    """
    for root, _dirs, files in os.walk(vault_name):
        for file in files:
            if not file.endswith(".md"):
                os.remove(os.path.join(root, file))
    # Walk bottom-up so that children are removed before their parent is
    # inspected; a top-down walk would leave behind folders whose only
    # content was other (now removed) empty folders.
    for root, _dirs, _files in os.walk(vault_name, topdown=False):
        if not os.listdir(root):
            os.rmdir(root)
def get_all_links_in_file(filename):
    """Return the text of every ``[[...]]`` wiki link found in a file.

    The file is read as UTF-8. Links are returned in order of appearance,
    duplicates included, exactly as written between the brackets (so a
    ``|alias`` part, if any, is kept). For example a file containing
    ``[[APP]]`` and ``[[test]]`` yields ``["APP", "test"]``.

    Args:
        filename: Path of the file to scan.

    Returns:
        A list of link strings; empty when the file has no wiki links.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        text = handle.read()
    return [found.group(1) for found in re.finditer(r"\[\[(.*?)\]\]", text)]
def find_files_by_link(link, vault_name):
    """Locate the Markdown file named ``<link>.md`` anywhere inside the vault.

    As a side effect, the path of the match (including the vault prefix) is
    stored in ``LINK_TO_FILENAME[link]``.

    Args:
        link: Link name without the ``.md`` suffix.
        vault_name: Path of the vault directory to search.

    Returns:
        The path of the first match with the leading ``vault_name`` removed,
        e.g. ``/test/test.md`` (it starts with a separator when
        ``vault_name`` has no trailing separator), or ``None`` when no file
        matches.
    """
    target = link + ".md"
    for root, _dirs, files in os.walk(vault_name):
        if target in files:
            full_path = os.path.join(root, target)
            LINK_TO_FILENAME[link] = full_path
            # Every path produced by os.walk starts with vault_name, so cut
            # off that prefix only. str.replace would also delete later
            # occurrences of the vault name (e.g. a "content" sub-folder
            # inside a vault called "content").
            return full_path[len(vault_name):]
    return None
# Get every file's links
def traverse_all_files(vault_name):
    """Scan every file in the vault and fill the module-level link tables.

    For a file that contains wiki links:
      * ``LINKS_IN_FILE[path]`` receives the raw link list;
      * each link target (the part before any ``|alias``) is added to
        ``ALL_LINKS`` once;
      * if a file named ``<target>.md`` exists in the vault, its
        vault-relative path is stored in ``LINK_TO_FILENAME[target]``.

    A file without any link is itself registered as a link target: its name
    without the ``.md`` suffix goes into ``ALL_LINKS`` and
    ``LINK_TO_FILENAME``.

    Args:
        vault_name: Path of the vault directory to scan.
    """
    for root, _dirs, files in os.walk(vault_name):
        for file in files:
            file_path = os.path.join(root, file)
            links_in_one_file = get_all_links_in_file(file_path)
            if links_in_one_file:
                LINKS_IN_FILE[file_path] = links_in_one_file
                for link in links_in_one_file:
                    # A link may carry a display alias ("target|alias").
                    # Only the target names a file, so use it consistently
                    # for ALL_LINKS, the file lookup and LINK_TO_FILENAME;
                    # otherwise aliased links can never be resolved.
                    target = link.split("|")[0]
                    if target not in ALL_LINKS:
                        ALL_LINKS.append(target)
                    relative_path = find_files_by_link(target, vault_name)
                    if relative_path:
                        LINK_TO_FILENAME[target] = relative_path
            else:
                # Strip only a trailing ".md"; str.replace would also eat a
                # ".md" occurring in the middle of the file name.
                name = file[:-3] if file.endswith(".md") else file
                if name not in ALL_LINKS:
                    ALL_LINKS.append(name)
                # Remove only the leading vault prefix from the path.
                LINK_TO_FILENAME[name] = file_path[len(vault_name):]
def replace_double_link_to_normal_link(vault_name):
    """Rewrite wiki links in every vault file as regular Markdown links.

    ``[[test]]`` becomes ``[test](/test/test.md)`` and ``[[test|label]]``
    becomes ``[label](/test/test.md)``, using the paths recorded in
    ``LINK_TO_FILENAME``. Links with no recorded path are left untouched.

    This modifies the files in place, so run it after all analysis steps.

    Args:
        vault_name: Path of the vault directory to process.
    """

    def to_markdown_link(match):
        link = match.group(1)
        target, _sep, alias = link.partition("|")
        # Try the raw link text first, then the alias-stripped target, so
        # the lookup works whichever form was used as the dictionary key.
        path = LINK_TO_FILENAME.get(link, LINK_TO_FILENAME.get(target))
        if path is None:
            return match.group(0)
        label = alias if alias else target
        return "[" + label + "](" + path + ")"

    for root, _dirs, files in os.walk(vault_name):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            converted = re.sub(r"\[\[(.*?)\]\]", to_markdown_link, content)
            # Leave files without any convertible link untouched on disk.
            if converted != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(converted)
def wrap_all_url_links(vault_name):
    """Wrap every bare http(s) URL in the vault files in angle brackets.

    ``http://example.com`` becomes ``<http://example.com>``. URLs that are
    already wrapped (preceded by ``<``) or that sit directly after ``(`` or
    ``[`` (Markdown link target / label) are left alone, so running the
    function twice does not nest brackets.

    This modifies the files in place, so run it after all analysis steps.

    Args:
        vault_name: Path of the vault directory to process.
    """
    # A URL runs up to the next whitespace or angle bracket. Substituting in
    # a single pass wraps each occurrence exactly once; replacing match by
    # match with str.replace would wrap repeated URLs several times.
    bare_url = re.compile(r"(?<![<(\[])\bhttps?://[^\s<>]+")
    for root, _dirs, files in os.walk(vault_name):
        for file in files:
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
            wrapped = bare_url.sub(lambda m: "<" + m.group(0) + ">", content)
            # Leave files without any bare URL untouched on disk.
            if wrapped != content:
                with open(file_path, "w", encoding="utf-8") as f:
                    f.write(wrapped)
def create_reverse_index():
    """Build ``REVERSE_INDEX``: for each link target, the files that use it.

    Reads ``LINKS_IN_FILE`` and, for every link in every file, records the
    file under the link's target name. Each file is listed at most once per
    target.
    """
    for file, links in LINKS_IN_FILE.items():
        for link in links:
            # Index by the target without its "|alias" part, matching how
            # names are stored in ALL_LINKS; otherwise a note referenced
            # only through aliased links would look unreferenced.
            target = link.split("|")[0]
            referrers = REVERSE_INDEX.setdefault(target, [])
            if file not in referrers:
                referrers.append(file)
def strip_empty_links():
    """Sort every entry of ``ALL_LINKS`` into one of two lists.

    A link with an entry in ``LINK_TO_FILENAME`` is appended to
    ``STRIPED_LINKS``; any other link is appended to ``UNCREATED_LINKS``.
    """
    for name in ALL_LINKS:
        bucket = STRIPED_LINKS if name in LINK_TO_FILENAME else UNCREATED_LINKS
        bucket.append(name)
def flush_all_stored_data():
    """Dump every module-level store to a JSON file, then empty the stores.

    One ``<STORE_NAME>.json`` file per store is written to the current
    working directory (UTF-8, 4-space indent, non-ASCII kept as is).
    Dictionaries are written with sorted keys; ``ALL_LINKS``,
    ``UNCREATED_LINKS`` and ``ISOLATED_LINKS`` are written sorted, while
    ``STRIPED_LINKS`` keeps its current order. Afterwards all seven stores
    are cleared in place.
    """

    def dump(path, payload, sort_keys=False):
        # The with-block closes the file; no explicit close() is needed.
        with open(path, "w", encoding="utf-8") as out:
            out.write(
                json.dumps(
                    payload, indent=4, sort_keys=sort_keys, ensure_ascii=False
                )
            )

    dump("LINK_TO_FILENAME.json", LINK_TO_FILENAME, sort_keys=True)
    dump("REVERSE_INDEX.json", REVERSE_INDEX, sort_keys=True)
    dump("ALL_LINKS.json", sorted(ALL_LINKS))
    dump("LINKS_IN_FILE.json", LINKS_IN_FILE, sort_keys=True)
    dump("STRIPED_LINKS.json", STRIPED_LINKS, sort_keys=True)
    dump("UNCREATED_LINKS.json", sorted(UNCREATED_LINKS))
    dump("ISOLATED_LINKS.json", sorted(ISOLATED_LINKS))

    # Reset every store so a later run starts from a clean state.
    for store in (
        ALL_LINKS,
        LINK_TO_FILENAME,
        LINKS_IN_FILE,
        REVERSE_INDEX,
        STRIPED_LINKS,
        UNCREATED_LINKS,
        ISOLATED_LINKS,
    ):
        store.clear()
def find_isolated_links():
    """Collect links that have a file but are not used by any file.

    Every entry of ``STRIPED_LINKS`` that is not a key of ``REVERSE_INDEX``
    is appended to ``ISOLATED_LINKS``, in order.
    """
    ISOLATED_LINKS.extend(
        name for name in STRIPED_LINKS if name not in REVERSE_INDEX
    )
if __name__ == "__main__":
    # Directory holding the vault, relative to the working directory.
    VAULT_NAME = "content"

    # Destructive: removes every non-Markdown file and prunes empty folders.
    delete_non_md_files_and_dirs(VAULT_NAME)
    # Collect links and file paths into the module-level stores.
    traverse_all_files(VAULT_NAME)

    # Derive the secondary tables; the order matters because each step reads
    # what the previous ones produced.
    for step in (create_reverse_index, strip_empty_links, find_isolated_links):
        step()

    # Print a short summary before the stores are written out and cleared.
    for label, links in (
        ("All links: ", ALL_LINKS),
        ("Striped links: ", STRIPED_LINKS),
        ("Uncreated links: ", UNCREATED_LINKS),
    ):
        print(label, len(links))

    flush_all_stored_data()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment