@gglanzani
Created December 28, 2025 22:07
Script to automatically add tags to markdown blog posts (with YAML front matter) using OpenAI GPT-4 for content analysis.
#!/usr/bin/env python3
"""
Script to automatically add tags to blog posts using OpenAI GPT-4 for content analysis.
Analyzes the content and generates relevant tags for each post.
"""
import os
import json
import re
import sys
import yaml
import time
from pathlib import Path
from typing import List
from collections import Counter

try:
    from openai import OpenAI
except ImportError:
    print("Error: OpenAI library not found. Please install it with: pip install openai")
    sys.exit(1)
def extract_frontmatter_and_content(file_path: str) -> tuple:
    """Extract frontmatter and content from a markdown file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split frontmatter and content
    if content.startswith('---'):
        parts = content.split('---', 2)
        if len(parts) >= 3:
            frontmatter_str = parts[1]
            post_content = parts[2].strip()
            try:
                # safe_load returns None for empty front matter, so fall back to {}
                frontmatter = yaml.safe_load(frontmatter_str) or {}
            except yaml.YAMLError:
                frontmatter = {}
        else:
            frontmatter = {}
            post_content = content
    else:
        frontmatter = {}
        post_content = content

    return frontmatter, post_content
def generate_tags_with_openai(title: str, content: str, client: OpenAI) -> List[str]:
    """Use OpenAI GPT-4 to analyze content and generate relevant tags."""
    # Prepare the prompt for GPT-4
    prompt = f"""Analyze the following blog post and generate relevant tags. The blog is a technical blog covering topics like programming, AI/ML, productivity, technology reviews, and personal thoughts.

Please generate 3-7 relevant tags that best categorize this content. Focus on:
- Programming languages and technologies mentioned
- Main topics and themes
- Tools and platforms discussed
- Content type (tutorial, opinion, review, etc.)

Return only a JSON array of tags, nothing else. Use lowercase, short tags (1-2 words max). Avoid overly generic tags. If the content is short, don't generate more than 1-2 tags.

Title: {title}

Content: {content[:3000]}{"..." if len(content) > 3000 else ""}

Tags:"""

    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that analyzes blog post content and generates relevant tags. Always respond with valid JSON array format."
                },
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.3
        )

        # Parse the response
        response_content = response.choices[0].message.content.strip()

        # Try to extract a JSON array from the response
        try:
            json_match = re.search(r'\[.*?\]', response_content, re.DOTALL)
            if json_match:
                tags_json = json_match.group()
                tags = json.loads(tags_json)
                # Ensure all tags are strings and clean them
                clean_tags = [str(tag).lower().strip() for tag in tags if tag]
                return clean_tags[:7]  # Limit to 7 tags max
            else:
                print(f"Warning: Could not find JSON array in response: {response_content}")
                return []
        except json.JSONDecodeError as e:
            print(f"Warning: Failed to parse JSON response: {response_content}")
            print(f"JSON Error: {e}")
            return []
    except Exception as e:
        print(f"Error calling OpenAI API: {e}")
        return []
def update_frontmatter_with_tags(file_path: str, tags: List[str], dry_run: bool = True) -> bool:
    """Insert a tags line into the front matter, directly after the title line."""
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Check if tags already exist
    for line in lines:
        if line.strip().startswith('tags:'):
            if not dry_run:
                print(" Tags already exist in file")
            return False

    # Find where to insert the tags line (directly after the title line)
    tags_line_index = None
    for i, line in enumerate(lines):
        if line.strip().startswith('title:'):
            tags_line_index = i + 1
            break

    if tags_line_index is None:
        if not dry_run:
            print(" Could not find title line to insert tags after")
        return False

    # Nothing to insert if no tags were suggested
    if not tags:
        return False

    # Format tags as a sorted JSON-style list
    sorted_tags = sorted(tags)
    tags_line = f'tags: {json.dumps(sorted_tags)}\n'

    # Insert the tags line
    if not dry_run:
        lines.insert(tags_line_index, tags_line)

        # Write back to file
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(lines)

    return True
def main():
    """Main function to process all posts and add tags using OpenAI."""
    # Check for OpenAI API key
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Error: OPENAI_API_KEY environment variable not set!")
        print("Please set your OpenAI API key with: export OPENAI_API_KEY='your-api-key'")
        return

    # Initialize OpenAI client
    client = OpenAI(api_key=api_key)

    posts_dir = Path("content/posts")
    if not posts_dir.exists():
        print("content/posts directory not found!")
        return

    print("Analyzing posts with OpenAI GPT-4 and suggesting tags...\n")

    # Get all posts without tags
    posts_to_process = []
    for post_file in sorted(posts_dir.glob("*.md")):
        # Check if tags already exist by reading the file directly
        with open(post_file, 'r', encoding='utf-8') as f:
            file_content = f.read()

        # Skip if tags already exist
        has_tags = False
        for line in file_content.split('\n'):
            if line.strip().startswith('tags:'):
                has_tags = True
                break
        if has_tags:
            continue

        # Extract frontmatter and content for processing
        frontmatter, content = extract_frontmatter_and_content(post_file)
        posts_to_process.append((post_file, frontmatter, content))

    if not posts_to_process:
        print("No posts found that need tags. All posts already have tags!")
        return

    print(f"Found {len(posts_to_process)} posts without tags.")

    # Ask for confirmation before making API calls
    response = input(f"This will make {len(posts_to_process)} OpenAI API calls. Continue? [y/N]: ")
    if response.lower().strip() not in ['y', 'yes']:
        print("Cancelled.")
        return

    # Process posts and collect suggestions
    all_suggested_tags = Counter()
    post_suggestions = []

    print("\nProcessing posts...")
    for i, (post_file, frontmatter, content) in enumerate(posts_to_process, 1):
        title = frontmatter.get('title', '')
        print(f"[{i}/{len(posts_to_process)}] Processing: {post_file.name}")

        # Generate tags using OpenAI
        suggested_tags = generate_tags_with_openai(title, content, client)
        if suggested_tags:
            post_suggestions.append((post_file, suggested_tags, title))
            all_suggested_tags.update(suggested_tags)
            print(f" Generated tags: {', '.join(suggested_tags)}")
        else:
            print(" No tags generated")

        # Add a small delay to be respectful to the API
        time.sleep(1)

    # Show summary
    print(f"\n{'-'*80}")
    print("Most common suggested tags:")
    for tag, count in all_suggested_tags.most_common(20):
        print(f" {tag}: {count} posts")

    print(f"\nSuggested tags for {len(post_suggestions)} posts:")
    print("-" * 80)

    # Show suggestions for each post
    for post_file, tags, title in post_suggestions:
        print(f"\n📝 {post_file.name}")
        print(f" Title: {title}")
        print(f" Suggested tags: {', '.join(tags)}")

    print(f"\n{'-'*80}")
    response = input(f"\nWould you like to apply these tags to {len(post_suggestions)} posts? [y/N]: ")
    if response.lower().strip() in ['y', 'yes']:
        print("\nApplying tags...")
        updated_count = 0
        for post_file, tags, title in post_suggestions:
            if update_frontmatter_with_tags(post_file, tags, dry_run=False):
                print(f"✓ Updated {post_file.name}")
                updated_count += 1
            else:
                print(f"- Skipped {post_file.name}")
        print(f"\n✅ Updated {updated_count} posts with tags!")
    else:
        print("\nNo changes made. You can run this script again to apply the suggestions.")


if __name__ == "__main__":
    main()
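
For reference, a minimal sketch of what the script expects and produces; the file name, date field, and tag values below are invented for illustration. A post such as content/posts/example.md with front matter like

---
date: 2024-01-15
title: "Why I still use RSS"
---

has no tags: line, so it is picked up for processing. If you accept the suggestions, the script inserts a sorted JSON-style tags line directly after the title:

---
date: 2024-01-15
title: "Why I still use RSS"
tags: ["opinion", "rss"]
---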