Skip to content

Instantly share code, notes, and snippets.

@weimzh
Created December 9, 2025 10:59
Show Gist options
  • Select an option

  • Save weimzh/251ffab9e4c7424aa8de9781f9c07240 to your computer and use it in GitHub Desktop.

Select an option

Save weimzh/251ffab9e4c7424aa8de9781f9c07240 to your computer and use it in GitHub Desktop.
A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown
#!/usr/bin/env python3
"""
Mastodon Post Fetcher
A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown
"""
import argparse
import html
import json
import os
import re
import sys
from datetime import datetime
from typing import Dict, List, Optional

import requests
class MastodonFetcher:
    """Fetch a Mastodon account's statuses over the REST API and render them as markdown."""

    # Largest page size requested from the statuses endpoint in a single call.
    MAX_PAGE_SIZE = 40

    def __init__(self, instance_url: str, access_token: Optional[str] = None,
                 timeout: float = 30.0):
        """
        Initialize the Mastodon fetcher

        Args:
            instance_url: The URL of the Mastodon instance (e.g., https://mastodon.social)
            access_token: Optional access token for authenticated requests
            timeout: Seconds to wait for each HTTP request before giving up
        """
        self.instance_url = instance_url.rstrip('/')
        self.access_token = access_token
        self.timeout = timeout
        self.headers = {"User-Agent": "Mastodon-Fetcher/1.0"}
        if access_token:
            self.headers["Authorization"] = f"Bearer {access_token}"

    def get_user_by_username(self, username: str) -> Optional[Dict]:
        """
        Get user information by username

        Args:
            username: The user's @username (with or without @)

        Returns:
            User data dictionary, or None if the instance answers 404

        Raises:
            requests.exceptions.RequestException: on any other HTTP or network failure
        """
        # Remove @ if present
        username = username.lstrip('@')
        url = f"{self.instance_url}/api/v1/accounts/lookup"
        response = requests.get(
            url,
            headers=self.headers,
            params={"acct": username},
            timeout=self.timeout,
        )
        # A 404 means "no such account"; report it as None so callers can
        # produce a friendly message instead of a raw HTTP error.
        if response.status_code == 404:
            return None
        response.raise_for_status()
        return response.json()

    def get_user_statuses(self, user_id: str, limit: int = 40) -> List[Dict]:
        """
        Get user's statuses (posts/replies/boosts) with pagination

        Args:
            user_id: The user's ID
            limit: Maximum number of statuses to fetch (default 40)

        Returns:
            List of at most `limit` status dictionaries, in the order the API returned them

        Raises:
            requests.exceptions.RequestException: on HTTP or network failure
        """
        url = f"{self.instance_url}/api/v1/accounts/{user_id}/statuses"
        params = {
            "exclude_replies": False,  # Include replies
            "exclude_reblogs": False,  # Include boosts/reblogs
        }
        all_statuses: List[Dict] = []

        # Keep fetching until we have enough statuses or no more data
        while len(all_statuses) < limit:
            # Never ask for more than is still needed, nor more than one page allows.
            params["limit"] = min(limit - len(all_statuses), self.MAX_PAGE_SIZE)
            response = requests.get(
                url, headers=self.headers, params=params, timeout=self.timeout
            )
            response.raise_for_status()
            statuses = response.json()
            if not statuses:
                break
            all_statuses.extend(statuses)
            # Continue the next request from the last status of this page.
            params["max_id"] = statuses[-1]["id"]

        # Return only up to the requested limit
        return all_statuses[:limit]

    def clean_content(self, content: str) -> str:
        """
        Convert the HTML body of a Mastodon status into markdown-friendly text

        Links become `[text](url)`, paragraphs are separated by a blank line,
        `<br>` becomes a newline, all remaining tags are dropped and HTML
        entities are decoded.

        Args:
            content: HTML content from Mastodon status

        Returns:
            Cleaned text ('' for empty or missing content)
        """
        if not content:
            return ''

        # Convert <a href="...">text</a> to [text](url) before tags are stripped.
        content = re.sub(
            r'<a\s+[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            r'[\2](\1)',
            content,
            flags=re.IGNORECASE,
        )
        # Each opening <p> (with or without attributes) starts a new paragraph;
        # a blank line is what markdown needs to keep paragraphs apart.
        content = re.sub(r'<\s*p(?:\s[^>]*)?>', '\n\n', content, flags=re.IGNORECASE)
        content = re.sub(r'<\s*/\s*p\s*>', '', content, flags=re.IGNORECASE)
        # Replace break tags with newlines
        content = re.sub(r'<\s*br\s*/?\s*>', '\n', content, flags=re.IGNORECASE)
        # Remove other HTML tags but keep their content
        text = re.sub(r'<[^>]+>', '', content)
        # Decode entities only after tag stripping, so an escaped "&lt;b&gt;"
        # survives as literal text instead of being removed as a tag.
        text = html.unescape(text)
        # Collapse runs of blank lines to a single blank line
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Remove leading/trailing whitespace from each line
        text = '\n'.join(line.strip() for line in text.strip().split('\n'))
        return text.strip()

    @staticmethod
    def _format_attachments(attachments: List[Dict]) -> str:
        """Render media attachments as a markdown list of images/links."""
        lines = ["**Attachments:**\n"]
        for media in attachments:
            if media.get('type') == 'image':
                # The description key may be present with a null value, so a
                # .get() default alone is not enough.
                label = media.get('description') or 'Image'
                lines.append(f"![{label}]({media['url']})\n")
            else:
                label = media.get('description') or 'Attachment'
                lines.append(f"[{label}]({media['url']})\n")
        lines.append("\n")
        return ''.join(lines)

    def format_status_as_markdown(self, status: Dict) -> str:
        """
        Format a Mastodon status as markdown

        Args:
            status: Status dictionary from Mastodon API

        Returns:
            Formatted markdown string, starting with a `## Post` or
            `## Boost: <name>` heading
        """
        created_at = datetime.fromisoformat(status['created_at'].replace('Z', '+00:00'))
        timestamp = created_at.strftime('%Y-%m-%d %H:%M:%S')
        parts = []

        reblog = status.get('reblog')
        if reblog:  # Boost/Reblog
            account = reblog['account']
            reblogged_user = account['display_name'] or account['username']
            parts.append(f"## Boost: {reblogged_user}\n")
            parts.append(f"**Boosted on:** {timestamp}\n\n")
            parts.append(f"{self.clean_content(reblog['content'])}\n\n")
            parts.append(f"*Original post: [{reblog['uri']}]({reblog['uri']})*\n\n")
        else:
            parts.append("## Post\n")
            parts.append(f"**Posted on:** {timestamp}\n\n")
            content = self.clean_content(status['content'])
            if content:
                parts.append(f"{content}\n\n")

        # Add media attachments if any
        attachments = status.get('media_attachments')
        if attachments:
            parts.append(self._format_attachments(attachments))

        # Add link to original post
        parts.append(f"*Original post: [{status['uri']}]({status['uri']})*\n\n")
        return ''.join(parts)

    def fetch_user_posts(self, username: str, limit: int = 40) -> str:
        """
        Fetch user posts and return as markdown

        Args:
            username: The user's @username
            limit: Maximum number of statuses to fetch

        Returns:
            Markdown formatted string of user's posts

        Raises:
            ValueError: if the user cannot be found
            requests.exceptions.RequestException: on HTTP or network failure
        """
        user = self.get_user_by_username(username)
        if not user:
            raise ValueError(f"User {username} not found")

        print(f"Found user: {user['display_name']} (@{user['acct']})")
        print(f"Fetching up to {limit} statuses...")
        statuses = self.get_user_statuses(user['id'], limit)

        # Build markdown content
        sections = [
            f"# Posts from {user['display_name']} (@{user['acct']})\n\n",
            f"**Account URL:** [{user['url']}]({user['url']})\n\n",
        ]
        sections.extend(self.format_status_as_markdown(status) for status in statuses)
        return ''.join(sections)
def main():
    """
    Command-line entry point: fetch a user's statuses and write them to a markdown file

    Returns:
        0 on success, 1 if fetching or writing failed (the error is printed to stderr)
    """
    parser = argparse.ArgumentParser(description="Fetch Mastodon posts and output as markdown")
    parser.add_argument("username", help="Mastodon username (with or without @)")
    parser.add_argument("--instance", default="https://mastodon.social", help="Mastodon instance URL (default: https://mastodon.social)")
    parser.add_argument("--token", help="Mastodon access token (optional, for authenticated requests)")
    parser.add_argument("--limit", type=int, default=40, help="Number of posts to fetch (default: 40)")
    parser.add_argument("--output", "-o", help="Output file path (default: username.md)")
    args = parser.parse_args()

    try:
        fetcher = MastodonFetcher(args.instance, args.token)
        markdown_content = fetcher.fetch_user_posts(args.username, args.limit)
        output_file = args.output or f"{args.username.lstrip('@')}.md"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report anything else (e.g. an unwritable output
        # path) instead of dumping a traceback, but still signal failure.
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1

    # Count the per-status headings at the start of a line only, so that
    # "## " appearing inside a post's text does not inflate the total.
    status_count = len(
        re.findall(r'^## (?:Post$|Boost: )', markdown_content, flags=re.MULTILINE)
    )
    print(f"Successfully fetched {status_count} posts/replies/boosts from @{args.username}")
    print(f"Output saved to: {output_file}")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and scripts can detect failure.
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment