Instantly share code, notes, and snippets.
Created
December 28, 2025 22:07
-
Star
0
(0)
You must be signed in to star a gist -
Fork
0
(0)
You must be signed in to fork a gist
-
-
Save gglanzani/e25a6d2faaa187859c577fd6267e0ce3 to your computer and use it in GitHub Desktop.
Script to automatically add tags to markdown blog posts (with YAML front matter) using OpenAI GPT-4 for content analysis.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Script to automatically add tags to blog posts using OpenAI GPT-4 for content analysis. | |
| Analyzes the content and generates relevant tags for each post. | |
| """ | |
import json
import os
import re
import time
from collections import Counter
from pathlib import Path
from typing import Dict, List, Set

import yaml
| try: | |
| from openai import OpenAI | |
| except ImportError: | |
| print("Error: OpenAI library not found. Please install it with: pip install openai") | |
| exit(1) | |
def extract_frontmatter_and_content(file_path: str) -> tuple:
    """Extract YAML frontmatter and body content from a markdown file.

    Args:
        file_path: Path to the markdown file (str or Path; open() accepts both).

    Returns:
        A ``(frontmatter, content)`` tuple where ``frontmatter`` is a dict
        (empty when missing, empty, or unparseable) and ``content`` is the
        post body as a string.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Defaults cover files with no frontmatter or a malformed fence.
    frontmatter = {}
    post_content = content

    # Frontmatter is delimited by '---' fences at the very top of the file.
    if content.startswith('---'):
        parts = content.split('---', 2)
        if len(parts) >= 3:
            frontmatter_str = parts[1]
            post_content = parts[2].strip()
            try:
                # Bug fix: safe_load returns None for an empty frontmatter
                # block, which crashed callers using frontmatter.get();
                # coerce falsy results to an empty dict.
                frontmatter = yaml.safe_load(frontmatter_str) or {}
            except yaml.YAMLError:
                frontmatter = {}

    return frontmatter, post_content
def generate_tags_with_openai(title: str, content: str, client: OpenAI) -> List[str]:
    """Use OpenAI GPT-4 to analyze a post and generate relevant tags.

    Args:
        title: The post title from its frontmatter (may be empty).
        content: The post body; only the first 3000 characters are sent.
        client: An initialized OpenAI client.

    Returns:
        Up to 7 unique, lowercase tag strings, or [] on any API or
        parsing failure (failures are reported on stdout, never raised).
    """
    prompt = f"""Analyze the following blog post and generate relevant tags. The blog is a technical blog covering topics like programming, AI/ML, productivity, technology reviews, and personal thoughts.
Please generate 3-7 relevant tags that best categorize this content. Focus on:
- Programming languages and technologies mentioned
- Main topics and themes
- Tools and platforms discussed
- Content type (tutorial, opinion, review, etc.)
Return only a JSON array of tags, nothing else. Use lowercase, short tags (1-2 words max). Avoid overly generic tags. If the content is short, don't generate more than 1-2 tags.
Title: {title}
Content: {content[:3000]}{"..." if len(content) > 3000 else ""}
Tags:"""
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that analyzes blog post content and generates relevant tags. Always respond with valid JSON array format."
                },
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.3
        )

        # Bug fix: message.content can be None (e.g. refusals), which
        # previously raised AttributeError on .strip().
        response_content = (response.choices[0].message.content or "").strip()

        try:
            # The model sometimes wraps the array in prose; extract the first
            # JSON-array-looking span instead of parsing the whole reply.
            json_match = re.search(r'\[.*?\]', response_content, re.DOTALL)
            if json_match:
                tags = json.loads(json_match.group())
                # Normalize, drop falsy/blank entries, dedupe preserving order.
                clean_tags = [str(tag).lower().strip() for tag in tags if tag]
                unique_tags = list(dict.fromkeys(t for t in clean_tags if t))
                return unique_tags[:7]  # Limit to 7 tags max
            print(f"Warning: Could not find JSON array in response: {response_content}")
            return []
        except json.JSONDecodeError as e:
            print(f"Warning: Failed to parse JSON response: {response_content}")
            print(f"JSON Error: {e}")
            return []
    except Exception as e:
        # Broad catch is deliberate: a single bad post must not abort the run.
        print(f"Error calling OpenAI API: {e}")
        return []
def update_frontmatter_with_tags(file_path: str, tags: List[str], dry_run: bool = True) -> bool:
    """Insert a ``tags:`` line into a post's frontmatter after its ``title:`` line.

    Args:
        file_path: Path to the markdown file to update.
        tags: Tags to write; sorted before serialization. An empty list is a no-op.
        dry_run: When True, only report what would happen; the file is untouched.

    Returns:
        True if tags were (or would be) added, False if the file was skipped.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip files that already carry a tags: entry anywhere in them.
    for line in lines:
        if line.strip().startswith('tags:'):
            if not dry_run:
                print("  Tags already exist in file")
            return False

    # Insert immediately after the title: line of the frontmatter.
    tags_line_index = None
    for i, line in enumerate(lines):
        if line.strip().startswith('title:'):
            tags_line_index = i + 1
            break

    if tags_line_index is None:
        if not dry_run:
            print("  Could not find title line to insert tags after")
        return False

    # Bug fix: the original only assigned tags_line when tags was non-empty,
    # but then referenced it unconditionally -> NameError on an empty list.
    if not tags:
        return False

    tags_line = f'tags: {json.dumps(sorted(tags))}\n'

    if not dry_run:
        lines.insert(tags_line_index, tags_line)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    return True
def _collect_untagged_posts(posts_dir: Path) -> list:
    """Return (path, frontmatter, content) tuples for posts with no tags: line."""
    posts_to_process = []
    for post_file in sorted(posts_dir.glob("*.md")):
        with open(post_file, 'r', encoding='utf-8') as f:
            file_content = f.read()
        # Skip posts that already have a tags: entry anywhere in the file
        # (mirrors the check in update_frontmatter_with_tags).
        if any(line.strip().startswith('tags:') for line in file_content.split('\n')):
            continue
        frontmatter, content = extract_frontmatter_and_content(post_file)
        posts_to_process.append((post_file, frontmatter, content))
    return posts_to_process


def _generate_suggestions(posts_to_process: list, client) -> tuple:
    """Call OpenAI per post; return (per-post suggestions, tag frequency Counter)."""
    all_suggested_tags = Counter()
    post_suggestions = []
    print("\nProcessing posts...")
    for i, (post_file, frontmatter, content) in enumerate(posts_to_process, 1):
        title = frontmatter.get('title', '')
        print(f"[{i}/{len(posts_to_process)}] Processing: {post_file.name}")
        suggested_tags = generate_tags_with_openai(title, content, client)
        if suggested_tags:
            post_suggestions.append((post_file, suggested_tags, title))
            all_suggested_tags.update(suggested_tags)
            print(f"  Generated tags: {', '.join(suggested_tags)}")
        else:
            print("  No tags generated")
        # Small delay to be respectful to the API rate limits.
        time.sleep(1)
    return post_suggestions, all_suggested_tags


def _print_summary(post_suggestions: list, all_suggested_tags: Counter) -> None:
    """Print the aggregate tag frequencies and the per-post suggestions."""
    print(f"\n{'-'*80}")
    print("Most common suggested tags:")
    for tag, count in all_suggested_tags.most_common(20):
        print(f"  {tag}: {count} posts")
    print(f"\nSuggested tags for {len(post_suggestions)} posts:")
    print("-" * 80)
    for post_file, tags, title in post_suggestions:
        print(f"\n📝 {post_file.name}")
        print(f"  Title: {title}")
        print(f"  Suggested tags: {', '.join(tags)}")
    print(f"\n{'-'*80}")


def _apply_suggestions(post_suggestions: list) -> None:
    """Write the suggested tags into each post's frontmatter."""
    print("\nApplying tags...")
    updated_count = 0
    for post_file, tags, title in post_suggestions:
        if update_frontmatter_with_tags(post_file, tags, dry_run=False):
            print(f"✓ Updated {post_file.name}")
            updated_count += 1
        else:
            print(f"- Skipped {post_file.name}")
    print(f"\n✅ Updated {updated_count} posts with tags!")


def main():
    """Collect untagged posts, suggest tags via OpenAI, and apply on confirmation."""
    # The API key must come from the environment; never hard-code it.
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set!")
        print("Please set your OpenAI API key with: export OPENAI_API_KEY='your-api-key'")
        return
    client = OpenAI(api_key=api_key)

    posts_dir = Path("content/posts")
    if not posts_dir.exists():
        print("Content/posts directory not found!")
        return

    print("Analyzing posts with OpenAI GPT-4 and suggesting tags...\n")
    posts_to_process = _collect_untagged_posts(posts_dir)
    if not posts_to_process:
        print("No posts found that need tags. All posts already have tags!")
        return
    print(f"Found {len(posts_to_process)} posts without tags.")

    # Confirm before spending API calls.
    answer = input(f"This will make {len(posts_to_process)} OpenAI API calls. Continue? [y/N]: ")
    if answer.lower().strip() not in ['y', 'yes']:
        print("Cancelled.")
        return

    post_suggestions, all_suggested_tags = _generate_suggestions(posts_to_process, client)
    _print_summary(post_suggestions, all_suggested_tags)

    # Confirm again before writing any files.
    answer = input(f"\nWould you like to apply these tags to {len(post_suggestions)} posts? [y/N]: ")
    if answer.lower().strip() in ['y', 'yes']:
        _apply_suggestions(post_suggestions)
    else:
        print("\nNo changes made. You can run this script again to apply the suggestions.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment