Created
December 9, 2025 10:59
-
-
Save weimzh/251ffab9e4c7424aa8de9781f9c07240 to your computer and use it in GitHub Desktop.
A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Mastodon Post Fetcher | |
| A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown | |
| """ | |
| import argparse | |
| import json | |
| import os | |
| import re | |
| from datetime import datetime | |
| from typing import Dict, List, Optional | |
| import requests | |
class MastodonFetcher:
    """Fetch an account's statuses from a Mastodon instance and render Markdown.

    All HTTP calls go through ``requests`` against the REST API of a single
    instance; an optional access token is sent as a bearer token.
    """

    # Largest page size requested from the statuses endpoint in one call.
    MAX_PAGE_SIZE = 40

    # Patterns used by clean_content(), compiled once for the whole class.
    _LINK_RE = re.compile(
        r'<a\s+[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    _P_OPEN_RE = re.compile(r'<\s*p(?:\s[^>]*)?>', re.IGNORECASE)
    _P_CLOSE_RE = re.compile(r'<\s*/\s*p\s*>', re.IGNORECASE)
    _BR_RE = re.compile(r'<\s*br\s*/?\s*>', re.IGNORECASE)
    _TAG_RE = re.compile(r'<[^>]+>')
    _BLANK_RUN_RE = re.compile(r'\n\s*\n')

    def __init__(self, instance_url: str, access_token: Optional[str] = None,
                 timeout: float = 30.0):
        """
        Initialize the Mastodon fetcher

        Args:
            instance_url: The URL of the Mastodon instance (e.g., https://mastodon.social)
            access_token: Optional access token for authenticated requests
            timeout: Seconds to wait for each HTTP request before giving up
        """
        self.instance_url = instance_url.rstrip('/')
        self.access_token = access_token
        self.timeout = timeout
        self.headers = {"User-Agent": "Mastodon-Fetcher/1.0"}
        if access_token:
            self.headers["Authorization"] = f"Bearer {access_token}"

    def get_user_by_username(self, username: str) -> Optional[Dict]:
        """
        Get user information by username

        Args:
            username: The user's @username (with or without @)

        Returns:
            User data dictionary or None if not found

        Raises:
            requests.exceptions.RequestException: On network failure or any
                HTTP error status other than 404.
        """
        # Remove @ if present
        username = username.lstrip('@')
        url = f"{self.instance_url}/api/v1/accounts/lookup"
        response = requests.get(
            url,
            headers=self.headers,
            params={"acct": username},
            timeout=self.timeout,
        )
        # A 404 means "no such account": report it as None, as documented,
        # instead of letting raise_for_status() turn it into an HTTPError.
        if response.status_code == 404:
            return None
        response.raise_for_status()
        return response.json()

    def get_user_statuses(self, user_id: str, limit: int = 40) -> List[Dict]:
        """
        Get user's statuses (posts/replies/boosts) with proper pagination

        Args:
            user_id: The user's ID
            limit: Maximum number of statuses to fetch (default 40)

        Returns:
            List of status dictionaries, at most ``limit`` long

        Raises:
            requests.exceptions.RequestException: On network failure or an
                HTTP error status.
        """
        url = f"{self.instance_url}/api/v1/accounts/{user_id}/statuses"
        collected: List[Dict] = []
        max_id = None

        # Keep fetching until we have enough statuses or no more data
        while len(collected) < limit:
            params = {
                # Ask only for what is still missing, capped at the page size.
                "limit": min(limit - len(collected), self.MAX_PAGE_SIZE),
                # Sent as lowercase literals; a Python False would be
                # serialized into the query string as "False".
                "exclude_replies": "false",   # Include replies
                "exclude_reblogs": "false",   # Include boosts/reblogs
            }
            if max_id is not None:
                params["max_id"] = max_id

            response = requests.get(
                url, headers=self.headers, params=params, timeout=self.timeout
            )
            response.raise_for_status()
            page = response.json()
            if not page:
                break

            collected.extend(page)
            # The last status of this page is the cursor for the next one.
            max_id = page[-1]['id']

        # Return only up to the requested limit
        return collected[:limit]

    def clean_content(self, content: str) -> str:
        """
        Clean HTML content from Mastodon status to plain text

        Links become ``[text](url)``, paragraphs are separated by a blank
        line, ``<br>`` becomes a newline, remaining tags are dropped and HTML
        entities are decoded.

        Args:
            content: HTML content from Mastodon status

        Returns:
            Cleaned plain text
        """
        # Function-scope import so this method does not depend on a new
        # module-level import.
        import html

        if not content:
            return ""

        # Convert <a href="...">...</a> to [text](url) before tags are stripped
        text = self._LINK_RE.sub(r'[\2](\1)', content)
        # Paragraph boundaries: a closing tag ends the paragraph with a blank
        # line so consecutive paragraphs stay separate in markdown.
        text = self._P_OPEN_RE.sub('\n', text)
        text = self._P_CLOSE_RE.sub('\n\n', text)
        # Replace break tags with newlines
        text = self._BR_RE.sub('\n', text)
        # Remove other HTML tags but keep their content
        text = self._TAG_RE.sub('', text)
        # Decode entities only after tag stripping, so an escaped "&lt;b&gt;"
        # written by the author is not mistaken for markup.
        text = html.unescape(text)
        # Reduce runs of blank lines to exactly one blank line
        text = self._BLANK_RUN_RE.sub('\n\n', text).strip()
        # Remove leading/trailing whitespace from each line
        return '\n'.join(line.strip() for line in text.split('\n')).strip()

    def _format_attachments(self, attachments: Optional[List[Dict]]) -> List[str]:
        """Render media attachments as markdown fragments (empty list if none)."""
        if not attachments:
            return []
        parts = ["**Attachments:**\n"]
        for media in attachments:
            is_image = media.get('type') == 'image'
            # The description may be missing or None; collapse whitespace so
            # a multi-line description cannot break the link syntax.
            label = ' '.join((media.get('description') or '').split())
            if not label:
                label = 'Image' if is_image else 'Attachment'
            prefix = '!' if is_image else ''
            parts.append(f"{prefix}[{label}]({media['url']})\n")
        parts.append("\n")
        return parts

    def format_status_as_markdown(self, status: Dict) -> str:
        """
        Format a Mastodon status as markdown

        Args:
            status: Status dictionary from Mastodon API

        Returns:
            Formatted markdown string
        """
        created_at = datetime.fromisoformat(status['created_at'].replace('Z', '+00:00'))
        stamp = created_at.strftime('%Y-%m-%d %H:%M:%S')

        reblog = status.get('reblog')
        if reblog:  # Boost/Reblog
            account = reblog['account']
            reblogged_user = account['display_name'] or account['username']
            return ''.join([
                f"## Boost: {reblogged_user}\n",
                f"**Boosted on:** {stamp}\n\n",
                f"{self.clean_content(reblog['content'])}\n\n",
                f"*Original post: [{reblog['uri']}]({reblog['uri']})*\n\n",
            ])

        parts = ["## Post\n", f"**Posted on:** {stamp}\n\n"]
        content = self.clean_content(status['content'])
        if content:
            parts.append(f"{content}\n\n")
        # Add media attachments if any
        parts.extend(self._format_attachments(status.get('media_attachments')))
        # Add link to original post
        parts.append(f"*Original post: [{status['uri']}]({status['uri']})*\n\n")
        return ''.join(parts)

    def fetch_user_posts(self, username: str, limit: int = 40) -> str:
        """
        Fetch user posts and return as markdown

        Prints progress messages to standard output while working.

        Args:
            username: The user's @username
            limit: Maximum number of statuses to fetch

        Returns:
            Markdown formatted string of user's posts

        Raises:
            ValueError: If the account cannot be found.
            requests.exceptions.RequestException: On network or HTTP errors.
        """
        user = self.get_user_by_username(username)
        if not user:
            raise ValueError(f"User {username} not found")

        # display_name can be empty; fall back the same way boosts do.
        shown_name = user.get('display_name') or user.get('username') or user['acct']
        print(f"Found user: {shown_name} (@{user['acct']})")
        print(f"Fetching up to {limit} statuses...")
        statuses = self.get_user_statuses(user['id'], limit)

        # Build markdown content
        sections = [
            f"# Posts from {shown_name} (@{user['acct']})\n\n",
            f"**Account URL:** [{user['url']}]({user['url']})\n\n",
        ]
        sections.extend(self.format_status_as_markdown(s) for s in statuses)
        return ''.join(sections)
def main() -> int:
    """Command-line entry point: fetch a user's statuses and write markdown.

    Parses the command line, downloads the statuses through MastodonFetcher
    and writes the markdown to ``--output`` (default: ``<username>.md``).

    Returns:
        0 on success, 1 if fetching or writing failed. Failure messages go
        to standard error.
    """
    import sys  # local import: only needed for stderr reporting here

    parser = argparse.ArgumentParser(description="Fetch Mastodon posts and output as markdown")
    parser.add_argument("username", help="Mastodon username (with or without @)")
    parser.add_argument("--instance", default="https://mastodon.social", help="Mastodon instance URL (default: https://mastodon.social)")
    parser.add_argument("--token", help="Mastodon access token (optional, for authenticated requests)")
    parser.add_argument("--limit", type=int, default=40, help="Number of posts to fetch (default: 40)")
    parser.add_argument("--output", "-o", help="Output file path (default: username.md)")
    args = parser.parse_args()

    # Normalize once so messages never show a doubled "@@".
    handle = args.username.lstrip('@')
    output_file = args.output or f"{handle}.md"

    try:
        fetcher = MastodonFetcher(args.instance, args.token)
        markdown_content = fetcher.fetch_user_posts(args.username, args.limit)
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:  # top-level boundary: report instead of a traceback
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1

    # Count section headings anchored at line start, rather than every "## "
    # substring, so post text containing "## " is not counted as a status.
    # NOTE(review): a post whose own text has a line that is exactly
    # "## Post" would still be miscounted; an exact count would need the
    # fetcher to return the number of statuses.
    fetched = len(re.findall(r'^## (?:Post$|Boost: )', markdown_content, flags=re.MULTILINE))
    print(f"Successfully fetched {fetched} posts/replies/boosts from @{handle}")
    print(f"Output saved to: {output_file}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment