weimzh · December 9, 2025 10:59
diff --git a/mastodon_fetcher.py b/mastodon_fetcher.py
 #!/usr/bin/env python3
 """
 Mastodon Post Fetcher

 A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown
 """

 import argparse
 import json
 import os
 import re
 from datetime import datetime
 from typing import Dict, List, Optional

 import requests


 class MastodonFetcher:
    """
    Read a Mastodon account's statuses over the REST API and render them as markdown.

    Every request is sent to ``instance_url`` with a fixed ``User-Agent``
    header and, when an access token was supplied, a bearer
    ``Authorization`` header.
    """

    # Largest page size requested from the statuses endpoint.
    MAX_STATUSES_PER_REQUEST = 40
    # Seconds before an HTTP request is abandoned; without a timeout
    # ``requests`` waits indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    def __init__(self, instance_url: str, access_token: Optional[str] = None):
        """
        Initialize the Mastodon fetcher

        Args:
            instance_url: The URL of the Mastodon instance (e.g., https://mastodon.social).
                Trailing slashes are removed so paths can be appended directly.
            access_token: Optional access token for authenticated requests
        """
        self.instance_url = instance_url.rstrip('/')
        self.access_token = access_token
        self.headers = {"User-Agent": "Mastodon-Fetcher/1.0"}

        if access_token:
            self.headers["Authorization"] = f"Bearer {access_token}"

    def get_user_by_username(self, username: str) -> Optional[Dict]:
        """
        Get user information by username

        Args:
            username: The user's @username (with or without @)

        Returns:
            User data dictionary, or None when the instance answers 404

        Raises:
            requests.exceptions.RequestException: On network failures or any
                non-404 HTTP error status.
        """
        response = requests.get(
            f"{self.instance_url}/api/v1/accounts/lookup",
            headers=self.headers,
            params={"acct": username.lstrip('@')},
            timeout=self.REQUEST_TIMEOUT,
        )

        # Honour the documented "None if not found" contract instead of
        # surfacing a missing account as an HTTPError.
        if response.status_code == 404:
            return None
        response.raise_for_status()

        return response.json()

    def get_user_statuses(self, user_id: str, limit: int = 40) -> List[Dict]:
        """
        Get user's statuses (posts/replies/boosts), following pagination

        Args:
            user_id: The user's ID
            limit: Maximum number of statuses to fetch (default 40)

        Returns:
            List of at most ``limit`` status dictionaries, in the order the
            API returned them. Empty when ``limit`` is not positive.

        Raises:
            requests.exceptions.RequestException: On network failures or an
                HTTP error status.
        """
        if limit <= 0:
            return []

        url = f"{self.instance_url}/api/v1/accounts/{user_id}/statuses"
        params = {
            "limit": min(limit, self.MAX_STATUSES_PER_REQUEST),
            # requests serialises Python booleans as "True"/"False"; the
            # lowercase literal is sent instead so replies and boosts stay
            # included. NOTE(review): capitalised "False" does not appear to
            # be parsed as false by the server - confirm against the API.
            "exclude_replies": "false",
            "exclude_reblogs": "false",
        }

        all_statuses: List[Dict] = []
        while len(all_statuses) < limit:
            response = requests.get(
                url,
                headers=self.headers,
                params=params,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()

            page = response.json()
            if not page:
                # The account has no older statuses.
                break

            all_statuses.extend(page)
            # Ask the next request for statuses older than the last one seen.
            params["max_id"] = page[-1]["id"]

        # The final page may overshoot the requested amount.
        return all_statuses[:limit]

    def clean_content(self, content: str) -> str:
        """
        Convert the HTML body of a Mastodon status into markdown-friendly text

        Anchors become ``[text](url)``, paragraphs are separated by a blank
        line, ``<br>`` becomes a newline, every other tag is dropped (its
        text is kept) and HTML entities are decoded.

        Args:
            content: HTML content from Mastodon status

        Returns:
            Cleaned text with surrounding whitespace removed from every line
        """
        # Imported locally so this method does not depend on a module-level
        # import that the rest of the file does not have.
        import html

        # Convert <a href="...">...</a> to [text](url) before tags are
        # stripped; DOTALL lets the link text span line breaks.
        content = re.sub(
            r'<a\s+[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
            r'[\2](\1)',
            content,
            flags=re.IGNORECASE | re.DOTALL,
        )

        # Paragraph boundaries become blank lines so markdown keeps the
        # paragraphs apart; opening tags may carry attributes.
        content = re.sub(r'<\s*p\b[^>]*>', '\n', content, flags=re.IGNORECASE)
        content = re.sub(r'<\s*/\s*p\s*>', '\n\n', content, flags=re.IGNORECASE)

        # Replace break tags with newlines
        content = re.sub(r'<\s*br\s*/?\s*>', '\n', content, flags=re.IGNORECASE)

        # Remove other HTML tags but keep their content
        clean_text = re.sub(r'<[^>]+>', '', content)

        # Decode all entities in a single pass (named and numeric) so that
        # e.g. "&#39;" is handled and "&amp;quot;" is not unescaped twice.
        clean_text = html.unescape(clean_text)

        # Collapse runs of blank lines into a single blank line
        clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
        clean_text = clean_text.strip()

        # Remove leading/trailing whitespace from each line
        clean_text = '\n'.join(line.strip() for line in clean_text.split('\n'))

        return clean_text.strip()

    def format_status_as_markdown(self, status: Dict) -> str:
        """
        Format a Mastodon status as markdown

        Boosts are rendered under a ``## Boost: <author>`` heading using the
        boosted status' content; everything else is rendered under
        ``## Post`` together with its media attachments.

        Args:
            status: Status dictionary from Mastodon API

        Returns:
            Formatted markdown string ending in a blank line
        """
        created_at = datetime.fromisoformat(status['created_at'].replace('Z', '+00:00'))
        timestamp = created_at.strftime('%Y-%m-%d %H:%M:%S')

        reblog = status.get('reblog')
        if reblog:
            account = reblog['account']
            author = account['display_name'] or account['username']
            return (
                f"## Boost: {author}\n"
                f"**Boosted on:** {timestamp}\n\n"
                f"{self.clean_content(reblog['content'])}\n\n"
                f"*Original post: [{reblog['uri']}]({reblog['uri']})*\n\n"
            )

        parts = ["## Post\n", f"**Posted on:** {timestamp}\n\n"]

        content = self.clean_content(status['content'])
        if content:
            parts.append(f"{content}\n\n")

        attachments = status.get('media_attachments')
        if attachments:
            parts.append("**Attachments:**\n")
            for media in attachments:
                # The description key can be present but null, so fall back
                # with `or` rather than a .get() default.
                if media.get('type') == 'image':
                    label = media.get('description') or 'Image'
                    parts.append(f"![{label}]({media['url']})\n")
                else:
                    label = media.get('description') or 'Attachment'
                    parts.append(f"[{label}]({media['url']})\n")
            parts.append("\n")

        # Add link to original post
        parts.append(f"*Original post: [{status['uri']}]({status['uri']})*\n\n")

        return "".join(parts)

    def fetch_user_posts(self, username: str, limit: int = 40) -> str:
        """
        Fetch user posts and return as markdown

        Prints progress information to stdout while working.

        Args:
            username: The user's @username
            limit: Maximum number of statuses to fetch

        Returns:
            Markdown formatted string of user's posts

        Raises:
            ValueError: If the account lookup returns nothing.
        """
        user = self.get_user_by_username(username)
        if not user:
            raise ValueError(f"User {username} not found")

        # Accounts may have an empty display name; use the handle instead of
        # emitting a blank heading.
        display_name = user['display_name'] or user['acct']

        print(f"Found user: {display_name} (@{user['acct']})")
        print(f"Fetching up to {limit} statuses...")

        statuses = self.get_user_statuses(user['id'], limit)

        # Build markdown content
        sections = [
            f"# Posts from {display_name} (@{user['acct']})\n\n",
            f"**Account URL:** [{user['url']}]({user['url']})\n\n",
        ]
        sections.extend(self.format_status_as_markdown(status) for status in statuses)

        return "".join(sections)


 def main():
    """
    Command-line entry point: fetch a user's statuses and write them to a markdown file.

    Returns:
        0 when the file was written, 1 when fetching or writing failed.
    """
    # Imported locally so error output can go to stderr without relying on a
    # module-level import of sys.
    import sys

    parser = argparse.ArgumentParser(description="Fetch Mastodon posts and output as markdown")
    parser.add_argument("username", help="Mastodon username (with or without @)")
    parser.add_argument("--instance", default="https://mastodon.social", help="Mastodon instance URL (default: https://mastodon.social)")
    parser.add_argument("--token", help="Mastodon access token (optional, for authenticated requests)")
    parser.add_argument("--limit", type=int, default=40, help="Number of posts to fetch (default: 40)")
    parser.add_argument("--output", "-o", help="Output file path (default: username.md)")

    args = parser.parse_args()

    # Normalise once so the default file name and the summary line never show "@@user".
    handle = args.username.lstrip('@')

    try:
        fetcher = MastodonFetcher(args.instance, args.token)
        markdown_content = fetcher.fetch_user_posts(args.username, args.limit)

        output_file = args.output or f"{handle}.md"
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
    except requests.exceptions.RequestException as e:
        print(f"Request error: {e}", file=sys.stderr)
        return 1
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: report anything unforeseen instead of a traceback.
        print(f"Unexpected error: {e}", file=sys.stderr)
        return 1

    # Count only the section headings the formatter emits; splitting on "## "
    # would also count any "## " occurring inside post text.
    fetched = len(re.findall(r'^## (?:Post$|Boost: )', markdown_content, flags=re.MULTILINE))
    print(f"Successfully fetched {fetched} posts/replies/boosts from @{handle}")
    print(f"Output saved to: {output_file}")
    return 0


 if __name__ == "__main__":
    # Propagate main()'s status as the process exit code.
    raise SystemExit(main())
	#!/usr/bin/env python3
	"""
	Mastodon Post Fetcher

	A tool to fetch posts, replies, and boosts from a Mastodon user and output as markdown
	"""

	import argparse
	import json
	import os
	import re
	from datetime import datetime
	from typing import Dict, List, Optional

	import requests


	class MastodonFetcher:
	    """
	    Read a Mastodon account's statuses over the REST API and render them as markdown.

	    Every request is sent to ``instance_url`` with a fixed ``User-Agent``
	    header and, when an access token was supplied, a bearer
	    ``Authorization`` header.
	    """

	    # Largest page size requested from the statuses endpoint.
	    MAX_STATUSES_PER_REQUEST = 40
	    # Seconds before an HTTP request is abandoned; without a timeout
	    # ``requests`` waits indefinitely on an unresponsive server.
	    REQUEST_TIMEOUT = 30

	    def __init__(self, instance_url: str, access_token: Optional[str] = None):
	        """
	        Initialize the Mastodon fetcher

	        Args:
	            instance_url: The URL of the Mastodon instance (e.g., https://mastodon.social).
	                Trailing slashes are removed so paths can be appended directly.
	            access_token: Optional access token for authenticated requests
	        """
	        self.instance_url = instance_url.rstrip('/')
	        self.access_token = access_token
	        self.headers = {"User-Agent": "Mastodon-Fetcher/1.0"}

	        if access_token:
	            self.headers["Authorization"] = f"Bearer {access_token}"

	    def get_user_by_username(self, username: str) -> Optional[Dict]:
	        """
	        Get user information by username

	        Args:
	            username: The user's @username (with or without @)

	        Returns:
	            User data dictionary, or None when the instance answers 404

	        Raises:
	            requests.exceptions.RequestException: On network failures or any
	                non-404 HTTP error status.
	        """
	        response = requests.get(
	            f"{self.instance_url}/api/v1/accounts/lookup",
	            headers=self.headers,
	            params={"acct": username.lstrip('@')},
	            timeout=self.REQUEST_TIMEOUT,
	        )

	        # Honour the documented "None if not found" contract instead of
	        # surfacing a missing account as an HTTPError.
	        if response.status_code == 404:
	            return None
	        response.raise_for_status()

	        return response.json()

	    def get_user_statuses(self, user_id: str, limit: int = 40) -> List[Dict]:
	        """
	        Get user's statuses (posts/replies/boosts), following pagination

	        Args:
	            user_id: The user's ID
	            limit: Maximum number of statuses to fetch (default 40)

	        Returns:
	            List of at most ``limit`` status dictionaries, in the order the
	            API returned them. Empty when ``limit`` is not positive.

	        Raises:
	            requests.exceptions.RequestException: On network failures or an
	                HTTP error status.
	        """
	        if limit <= 0:
	            return []

	        url = f"{self.instance_url}/api/v1/accounts/{user_id}/statuses"
	        params = {
	            "limit": min(limit, self.MAX_STATUSES_PER_REQUEST),
	            # requests serialises Python booleans as "True"/"False"; the
	            # lowercase literal is sent instead so replies and boosts stay
	            # included. NOTE(review): capitalised "False" does not appear to
	            # be parsed as false by the server - confirm against the API.
	            "exclude_replies": "false",
	            "exclude_reblogs": "false",
	        }

	        all_statuses: List[Dict] = []
	        while len(all_statuses) < limit:
	            response = requests.get(
	                url,
	                headers=self.headers,
	                params=params,
	                timeout=self.REQUEST_TIMEOUT,
	            )
	            response.raise_for_status()

	            page = response.json()
	            if not page:
	                # The account has no older statuses.
	                break

	            all_statuses.extend(page)
	            # Ask the next request for statuses older than the last one seen.
	            params["max_id"] = page[-1]["id"]

	        # The final page may overshoot the requested amount.
	        return all_statuses[:limit]

	    def clean_content(self, content: str) -> str:
	        """
	        Convert the HTML body of a Mastodon status into markdown-friendly text

	        Anchors become ``[text](url)``, paragraphs are separated by a blank
	        line, ``<br>`` becomes a newline, every other tag is dropped (its
	        text is kept) and HTML entities are decoded.

	        Args:
	            content: HTML content from Mastodon status

	        Returns:
	            Cleaned text with surrounding whitespace removed from every line
	        """
	        # Imported locally so this method does not depend on a module-level
	        # import that the rest of the file does not have.
	        import html

	        # Convert <a href="...">...</a> to [text](url) before tags are
	        # stripped; DOTALL lets the link text span line breaks.
	        content = re.sub(
	            r'<a\s+[^>]*href=["\']([^"\']*)["\'][^>]*>(.*?)</a>',
	            r'[\2](\1)',
	            content,
	            flags=re.IGNORECASE | re.DOTALL,
	        )

	        # Paragraph boundaries become blank lines so markdown keeps the
	        # paragraphs apart; opening tags may carry attributes.
	        content = re.sub(r'<\s*p\b[^>]*>', '\n', content, flags=re.IGNORECASE)
	        content = re.sub(r'<\s*/\s*p\s*>', '\n\n', content, flags=re.IGNORECASE)

	        # Replace break tags with newlines
	        content = re.sub(r'<\s*br\s*/?\s*>', '\n', content, flags=re.IGNORECASE)

	        # Remove other HTML tags but keep their content
	        clean_text = re.sub(r'<[^>]+>', '', content)

	        # Decode all entities in a single pass (named and numeric) so that
	        # e.g. "&#39;" is handled and "&amp;quot;" is not unescaped twice.
	        clean_text = html.unescape(clean_text)

	        # Collapse runs of blank lines into a single blank line
	        clean_text = re.sub(r'\n\s*\n', '\n\n', clean_text)
	        clean_text = clean_text.strip()

	        # Remove leading/trailing whitespace from each line
	        clean_text = '\n'.join(line.strip() for line in clean_text.split('\n'))

	        return clean_text.strip()

	    def format_status_as_markdown(self, status: Dict) -> str:
	        """
	        Format a Mastodon status as markdown

	        Boosts are rendered under a ``## Boost: <author>`` heading using the
	        boosted status' content; everything else is rendered under
	        ``## Post`` together with its media attachments.

	        Args:
	            status: Status dictionary from Mastodon API

	        Returns:
	            Formatted markdown string ending in a blank line
	        """
	        created_at = datetime.fromisoformat(status['created_at'].replace('Z', '+00:00'))
	        timestamp = created_at.strftime('%Y-%m-%d %H:%M:%S')

	        reblog = status.get('reblog')
	        if reblog:
	            account = reblog['account']
	            author = account['display_name'] or account['username']
	            return (
	                f"## Boost: {author}\n"
	                f"**Boosted on:** {timestamp}\n\n"
	                f"{self.clean_content(reblog['content'])}\n\n"
	                f"*Original post: [{reblog['uri']}]({reblog['uri']})*\n\n"
	            )

	        parts = ["## Post\n", f"**Posted on:** {timestamp}\n\n"]

	        content = self.clean_content(status['content'])
	        if content:
	            parts.append(f"{content}\n\n")

	        attachments = status.get('media_attachments')
	        if attachments:
	            parts.append("**Attachments:**\n")
	            for media in attachments:
	                # The description key can be present but null, so fall back
	                # with `or` rather than a .get() default.
	                if media.get('type') == 'image':
	                    label = media.get('description') or 'Image'
	                    parts.append(f"![{label}]({media['url']})\n")
	                else:
	                    label = media.get('description') or 'Attachment'
	                    parts.append(f"[{label}]({media['url']})\n")
	            parts.append("\n")

	        # Add link to original post
	        parts.append(f"*Original post: [{status['uri']}]({status['uri']})*\n\n")

	        return "".join(parts)

	    def fetch_user_posts(self, username: str, limit: int = 40) -> str:
	        """
	        Fetch user posts and return as markdown

	        Prints progress information to stdout while working.

	        Args:
	            username: The user's @username
	            limit: Maximum number of statuses to fetch

	        Returns:
	            Markdown formatted string of user's posts

	        Raises:
	            ValueError: If the account lookup returns nothing.
	        """
	        user = self.get_user_by_username(username)
	        if not user:
	            raise ValueError(f"User {username} not found")

	        # Accounts may have an empty display name; use the handle instead of
	        # emitting a blank heading.
	        display_name = user['display_name'] or user['acct']

	        print(f"Found user: {display_name} (@{user['acct']})")
	        print(f"Fetching up to {limit} statuses...")

	        statuses = self.get_user_statuses(user['id'], limit)

	        # Build markdown content
	        sections = [
	            f"# Posts from {display_name} (@{user['acct']})\n\n",
	            f"**Account URL:** [{user['url']}]({user['url']})\n\n",
	        ]
	        sections.extend(self.format_status_as_markdown(status) for status in statuses)

	        return "".join(sections)


	def main():
	    """
	    Command-line entry point: fetch a user's statuses and write them to a markdown file.

	    Returns:
	        0 when the file was written, 1 when fetching or writing failed.
	    """
	    # Imported locally so error output can go to stderr without relying on a
	    # module-level import of sys.
	    import sys

	    parser = argparse.ArgumentParser(description="Fetch Mastodon posts and output as markdown")
	    parser.add_argument("username", help="Mastodon username (with or without @)")
	    parser.add_argument("--instance", default="https://mastodon.social", help="Mastodon instance URL (default: https://mastodon.social)")
	    parser.add_argument("--token", help="Mastodon access token (optional, for authenticated requests)")
	    parser.add_argument("--limit", type=int, default=40, help="Number of posts to fetch (default: 40)")
	    parser.add_argument("--output", "-o", help="Output file path (default: username.md)")

	    args = parser.parse_args()

	    # Normalise once so the default file name and the summary line never show "@@user".
	    handle = args.username.lstrip('@')

	    try:
	        fetcher = MastodonFetcher(args.instance, args.token)
	        markdown_content = fetcher.fetch_user_posts(args.username, args.limit)

	        output_file = args.output or f"{handle}.md"
	        with open(output_file, 'w', encoding='utf-8') as f:
	            f.write(markdown_content)
	    except requests.exceptions.RequestException as e:
	        print(f"Request error: {e}", file=sys.stderr)
	        return 1
	    except ValueError as e:
	        print(f"Error: {e}", file=sys.stderr)
	        return 1
	    except Exception as e:
	        # Top-level boundary: report anything unforeseen instead of a traceback.
	        print(f"Unexpected error: {e}", file=sys.stderr)
	        return 1

	    # Count only the section headings the formatter emits; splitting on "## "
	    # would also count any "## " occurring inside post text.
	    fetched = len(re.findall(r'^## (?:Post$|Boost: )', markdown_content, flags=re.MULTILINE))
	    print(f"Successfully fetched {fetched} posts/replies/boosts from @{handle}")
	    print(f"Output saved to: {output_file}")
	    return 0


	if __name__ == "__main__":
	    # Propagate main()'s status as the process exit code.
	    raise SystemExit(main())
No results found