schroneko · December 30, 2024 06:17
diff --git a/extract_rss.py b/extract_rss.py
 import re
 from bs4 import BeautifulSoup
 import json

 # Substrings (matched case-insensitively) that mark a link as feed-like.
 _FEED_KEYWORDS = ('feed', 'rss', 'atom')

 # Hosts (matched case-sensitively) whose links are always collected.
 _FEED_HOSTS = (
     'note.com',
     'zenn.dev',
     'qiita.com',
     'connpass.com',
     'speakerdeck.com',
     'hatenablog.com',
     'substack.com',
 )

 # Substrings (matched case-insensitively) that disqualify a link.
 _EXCLUDED_FRAGMENTS = (
     'feed/tag',
     'feed/category',
     'feed/author',
     'feed/comments',
     'javascript:',
 )

 # Suffixes (matched case-sensitively) of static assets that are never feeds.
 _EXCLUDED_SUFFIXES = ('.jpg', '.png', '.gif', '.css', '.js')


 def _is_feed_candidate(href):
     """Return True if *href* looks like a feed link and is not excluded."""
     lowered = href.lower()
     # Suffix tests such as '.atom', '/rss' and '/feed' are implied by the
     # keyword substring tests, so only the substring tests are needed.
     wanted = (
         any(keyword in lowered for keyword in _FEED_KEYWORDS)
         or any(host in href for host in _FEED_HOSTS)
     )
     if not wanted:
         return False
     if any(fragment in lowered for fragment in _EXCLUDED_FRAGMENTS):
         return False
     return not href.endswith(_EXCLUDED_SUFFIXES)


 def _feed_name(url):
     """Return the second-to-last '/'-separated segment of *url*.

     A URL that contains no '/' has only one segment; that segment is
     returned instead of raising IndexError.
     """
     segments = url.split('/')
     return segments[-2] if len(segments) >= 2 else segments[0]


 def extract_rss_urls(html_file, rss_path='rss.txt', feeds_path='feeds.json'):
     """Collect feed-like links from an HTML file and write them to disk.

     Every ``<a href=...>`` in *html_file* is tested with
     ``_is_feed_candidate``. The surviving URLs are de-duplicated, sorted,
     and written twice: one URL per line to *rss_path*, and as a JSON list
     of ``{"feed": name, "url": url}`` objects to *feeds_path*. The number
     of unique URLs is printed at the end.

     Args:
         html_file: Path of the HTML file to scan (read as UTF-8).
         rss_path: Destination of the plain-text URL list.
         feeds_path: Destination of the JSON feed list.

     Returns:
         None.
     """
     with open(html_file, 'r', encoding='utf-8') as f:
         soup = BeautifulSoup(f.read(), 'html.parser')

     # A set removes duplicates; sorting once keeps both outputs in the
     # same deterministic order.
     unique_urls = sorted({
         a['href']
         for a in soup.find_all('a', href=True)
         if _is_feed_candidate(a['href'])
     })

     with open(rss_path, 'w', encoding='utf-8') as f:
         f.writelines(url + '\n' for url in unique_urls)

     feeds = [{"feed": _feed_name(url), "url": url} for url in unique_urls]

     with open(feeds_path, 'w', encoding='utf-8') as f:
         json.dump(feeds, f, indent=2, ensure_ascii=False)

     print(f"Total feeds found: {len(unique_urls)}")

 if __name__ == "__main__":
     # Imported here so the entry point does not depend on the file-level
     # import block.
     import sys

     # An optional first command-line argument selects the HTML file to scan;
     # without it the script falls back to 'index.html' as before.
     extract_rss_urls(sys.argv[1] if len(sys.argv) > 1 else 'index.html')
	import re
	from bs4 import BeautifulSoup
	import json

	# Substrings (matched case-insensitively) that mark a link as feed-like.
	_FEED_KEYWORDS = ('feed', 'rss', 'atom')

	# Hosts (matched case-sensitively) whose links are always collected.
	_FEED_HOSTS = (
	    'note.com',
	    'zenn.dev',
	    'qiita.com',
	    'connpass.com',
	    'speakerdeck.com',
	    'hatenablog.com',
	    'substack.com',
	)

	# Substrings (matched case-insensitively) that disqualify a link.
	_EXCLUDED_FRAGMENTS = (
	    'feed/tag',
	    'feed/category',
	    'feed/author',
	    'feed/comments',
	    'javascript:',
	)

	# Suffixes (matched case-sensitively) of static assets that are never feeds.
	_EXCLUDED_SUFFIXES = ('.jpg', '.png', '.gif', '.css', '.js')


	def _is_feed_candidate(href):
	    """Return True if *href* looks like a feed link and is not excluded."""
	    lowered = href.lower()
	    # Suffix tests such as '.atom', '/rss' and '/feed' are implied by the
	    # keyword substring tests, so only the substring tests are needed.
	    wanted = (
	        any(keyword in lowered for keyword in _FEED_KEYWORDS)
	        or any(host in href for host in _FEED_HOSTS)
	    )
	    if not wanted:
	        return False
	    if any(fragment in lowered for fragment in _EXCLUDED_FRAGMENTS):
	        return False
	    return not href.endswith(_EXCLUDED_SUFFIXES)


	def _feed_name(url):
	    """Return the second-to-last '/'-separated segment of *url*.

	    A URL that contains no '/' has only one segment; that segment is
	    returned instead of raising IndexError.
	    """
	    segments = url.split('/')
	    return segments[-2] if len(segments) >= 2 else segments[0]


	def extract_rss_urls(html_file, rss_path='rss.txt', feeds_path='feeds.json'):
	    """Collect feed-like links from an HTML file and write them to disk.

	    Every ``<a href=...>`` in *html_file* is tested with
	    ``_is_feed_candidate``. The surviving URLs are de-duplicated, sorted,
	    and written twice: one URL per line to *rss_path*, and as a JSON list
	    of ``{"feed": name, "url": url}`` objects to *feeds_path*. The number
	    of unique URLs is printed at the end.

	    Args:
	        html_file: Path of the HTML file to scan (read as UTF-8).
	        rss_path: Destination of the plain-text URL list.
	        feeds_path: Destination of the JSON feed list.

	    Returns:
	        None.
	    """
	    with open(html_file, 'r', encoding='utf-8') as f:
	        soup = BeautifulSoup(f.read(), 'html.parser')

	    # A set removes duplicates; sorting once keeps both outputs in the
	    # same deterministic order.
	    unique_urls = sorted({
	        a['href']
	        for a in soup.find_all('a', href=True)
	        if _is_feed_candidate(a['href'])
	    })

	    with open(rss_path, 'w', encoding='utf-8') as f:
	        f.writelines(url + '\n' for url in unique_urls)

	    feeds = [{"feed": _feed_name(url), "url": url} for url in unique_urls]

	    with open(feeds_path, 'w', encoding='utf-8') as f:
	        json.dump(feeds, f, indent=2, ensure_ascii=False)

	    print(f"Total feeds found: {len(unique_urls)}")

	if __name__ == "__main__":
	    # Imported here so the entry point does not depend on the file-level
	    # import block.
	    import sys

	    # An optional first command-line argument selects the HTML file to scan;
	    # without it the script falls back to 'index.html' as before.
	    extract_rss_urls(sys.argv[1] if len(sys.argv) > 1 else 'index.html')
No results found