@schroneko
Created December 30, 2024 06:17
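A Python script that scans a saved HTML page for RSS/Atom feed URLs. It collects links that look like feeds or point to known blogging platforms (note.com, zenn.dev, Qiita, connpass, Speaker Deck, Hatena Blog, Substack), filters out per-tag/category/comment feeds and static assets, then writes the deduplicated list to rss.txt and a named-feed mapping to feeds.json.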
import json

from bs4 import BeautifulSoup


def extract_rss_urls(html_file):
    # Read the saved HTML page and parse it.
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()
    soup = BeautifulSoup(html_content, 'html.parser')

    urls = []
    for a in soup.find_all('a', href=True):
        href = a['href']
        # Keep links that look like feeds, or that point to known blogging platforms.
        if any([
            href.endswith('.atom'),
            href.endswith('/rss'),
            href.endswith('/feed'),
            'feed' in href.lower(),
            'rss' in href.lower(),
            'atom' in href.lower(),
            'note.com' in href,
            'zenn.dev' in href,
            'qiita.com' in href,
            'connpass.com' in href,
            'speakerdeck.com' in href,
            'hatenablog.com' in href,
            'substack.com' in href
        ]):
            # Drop per-tag/category/author/comment feeds, javascript: links,
            # and static assets that slipped through the keyword match.
            if not any([
                'feed/tag' in href.lower(),
                'feed/category' in href.lower(),
                'feed/author' in href.lower(),
                'feed/comments' in href.lower(),
                'javascript:' in href.lower(),
                href.endswith('.jpg'),
                href.endswith('.png'),
                href.endswith('.gif'),
                href.endswith('.css'),
                href.endswith('.js')
            ]):
                urls.append(href)

    unique_urls = list(set(urls))

    # Write one URL per line, sorted for stable output.
    with open('rss.txt', 'w', encoding='utf-8') as f:
        for url in sorted(unique_urls):
            f.write(url + '\n')

    # Derive a feed name from the URL: for ".../<name>/rss" or ".../<name>/feed",
    # take the second-to-last path segment; otherwise fall back to the last one.
    feeds = []
    for url in sorted(unique_urls):
        parts = url.rstrip('/').split('/')
        if url.endswith('/rss') or url.endswith('/feed'):
            feed_name = parts[-2]
        else:
            feed_name = parts[-1]
        feeds.append({
            "feed": feed_name,
            "url": url
        })

    with open('feeds.json', 'w', encoding='utf-8') as f:
        json.dump(feeds, f, indent=2, ensure_ascii=False)

    print(f"Total feeds found: {len(unique_urls)}")


if __name__ == "__main__":
    extract_rss_urls('index.html')
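Usage sketch (the run steps and sample values here are illustrative, not from the gist): save the page you want to scan as index.html next to the script, install the parser with pip install beautifulsoup4, and run the script with python. Assuming the page linked to a hypothetical note.com blog at https://note.com/example-user/rss, the corresponding feeds.json entry would look like:

  {
    "feed": "example-user",
    "url": "https://note.com/example-user/rss"
  }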