Skip to content

Instantly share code, notes, and snippets.

@x
Created December 8, 2025 21:00
Show Gist options
  • Select an option

  • Save x/c26e6cd5d151d5ed10c725d03dc84116 to your computer and use it in GitHub Desktop.

Select an option

Save x/c26e6cd5d151d5ed10c725d03dc84116 to your computer and use it in GitHub Desktop.
Downloads an llms.txt and all referenced markdown/text files into a local folder.
#!/bin/bash
# Downloads an llms.txt and all referenced markdown/text files into a local folder.
#
# This script recursively downloads documentation intended for LLMs, preserving
# the original URL structure as a local folder hierarchy. All URL references in
# downloaded files are rewritten to point to the local copies.
#
# Usage:
# ./download-llms-txt.sh <source_url> <target_folder>
#
# Example:
# ./download-llms-txt.sh https://google.github.io/adk-docs/llms.txt ~/src/my_project/agent_docs/adk-docs/
#
# This creates:
# ~/src/my_project/agent_docs/adk-docs/llms.txt
# ~/src/my_project/agent_docs/adk-docs/get-started/index.md
# ... and other referenced files
set -euo pipefail

if [[ $# -ne 2 ]]; then
  printf 'Usage: %s <source_url> <target_folder>\n' "$0" >&2
  exit 1
fi

SOURCE_URL="$1"
TARGET_FOLDER="$2"

# Split the source URL into scheme+host and the directory portion of its path
# using bash's own regex engine instead of echo|sed pipelines (no forks, and
# no echo on arbitrary data).
# e.g. https://google.github.io/adk-docs/llms.txt
#   -> BASE_URL=https://google.github.io   URL_BASE_PATH=/adk-docs
if [[ "$SOURCE_URL" =~ ^(https?://[^/]+)(.*)$ ]]; then
  BASE_URL="${BASH_REMATCH[1]}"
  # Drop the last path component (shortest "/..." suffix) to get the base dir;
  # "/llms.txt" -> "" and "/adk-docs/llms.txt" -> "/adk-docs".
  URL_BASE_PATH="${BASH_REMATCH[2]%/*}"
else
  # Not an http(s) URL: keep the old sed pass-through behaviour so the failure
  # mode remains "downloads fail with a warning", not an early silent change.
  BASE_URL="$SOURCE_URL"
  URL_BASE_PATH="${SOURCE_URL%/*}"
fi

# Track downloaded files using a temp file (compatible with bash 3.x, which
# lacks associative arrays); cleaned up on any exit path via the trap.
DOWNLOADED_FILE=$(mktemp)
trap 'rm -f "$DOWNLOADED_FILE"' EXIT
# Return success (0) iff the exact URL has already been recorded as downloaded.
# Globals:   DOWNLOADED_FILE (read)
# Arguments: $1 - URL to look up
is_downloaded() {
  # -x: whole-line match, -F: literal string (URLs contain regex metachars
  # like '.'), --: protect against an argument beginning with '-'.
  # stderr is silenced so a not-yet-created tracking file reads as "not found".
  grep -qxF -- "$1" "$DOWNLOADED_FILE" 2>/dev/null
}
# Record a URL as downloaded by appending it to the tracking file.
# Globals:   DOWNLOADED_FILE (appended to)
# Arguments: $1 - URL to record
mark_downloaded() {
  # printf instead of echo: echo would misparse a value starting with -n/-e.
  printf '%s\n' "$1" >> "$DOWNLOADED_FILE"
}
# Recursively download one URL plus every same-host .md/.txt file it
# references, rewriting those references in the saved copy to point at the
# local files. Safe against link cycles via the DOWNLOADED_FILE registry.
#
# Globals:   BASE_URL, URL_BASE_PATH (read); DOWNLOADED_FILE (via helpers)
# Arguments: $1 - absolute URL to fetch
#            $2 - local root folder that mirrors the remote hierarchy
# Outputs:   progress lines to stdout, warnings to stderr
download_and_process() {
local url="$1"
local base_dir="$2"
# Skip if already downloaded (the link graph may contain cycles)
if is_downloaded "$url"; then
return
fi
mark_downloaded "$url"
# Compute local path from URL, stripping the base path
local url_path
url_path=$(echo "$url" | sed -E 's|https?://[^/]+||')
# Strip the URL base path to avoid duplicate directories
if [[ -n "$URL_BASE_PATH" ]]; then
url_path="${url_path#"$URL_BASE_PATH"}"
fi
# Handle root path: a bare host or "/" still needs a concrete filename
if [[ -z "$url_path" || "$url_path" == "/" ]]; then
url_path="/index.txt"
fi
local local_path="${base_dir}${url_path}"
local local_dir
local_dir=$(dirname "$local_path")
# Create directory and download file
mkdir -p "$local_dir"
echo "Downloading: $url -> $local_path"
# Best-effort fetch: a broken link warns and returns, it does not abort the run
if ! curl -sSfL "$url" -o "$local_path" 2>/dev/null; then
echo " Warning: Failed to download $url" >&2
return
fi
# Find all absolute URLs in the downloaded file that point to .md or .txt
# files; '|| true' keeps set -e happy when grep finds nothing
local linked_urls
linked_urls=$(grep -oE 'https?://[^[:space:]"<>)]+\.(md|txt)' "$local_path" 2>/dev/null | sort -u || true)
# Also find relative URLs (starting with /), either as bare "(/path.md)"
# targets or full markdown links "[label](/path.md)"; the second grep
# isolates just the path portion
local relative_urls
relative_urls=$(grep -oE '\(/[^[:space:]"<>)]+\.(md|txt)\)|\[[^]]*\]\(/[^)]+\.(md|txt)\)' "$local_path" 2>/dev/null | grep -oE '/[^[:space:]"<>)]+\.(md|txt)' | sort -u || true)
# Process absolute URLs that match our base.
# NOTE(review): bash 'local' gives every recursive invocation its own
# instance, so the saved_* copies below are defensive rather than required.
local saved_local_path="$local_path"
# Unquoted expansion is intentional: word-split the newline-separated list
# (the regex above cannot match whitespace, so URLs split cleanly)
for linked_url in $linked_urls; do
# Only process URLs from the same host
if [[ "$linked_url" == "$BASE_URL"* ]]; then
# Keep a copy of the URL for the sed rewrite after the recursive call
local saved_linked_url="$linked_url"
# Download the linked file (may already be downloaded via recursion)
download_and_process "$linked_url" "$base_dir"
# Rewrite URL to a local relative path ("." + path with base path stripped)
local rewrite_path
rewrite_path=$(echo "$saved_linked_url" | sed -E 's|https?://[^/]+||')
if [[ -n "$URL_BASE_PATH" ]]; then
rewrite_path="${rewrite_path#"$URL_BASE_PATH"}"
fi
# Try BSD sed (-i '') first, then GNU sed (-i); '|| true' so a failed
# rewrite never aborts the whole run under set -e.
# NOTE(review): the URL and path are interpolated unescaped into the sed
# expression — a URL containing '|', '&' or '\' would corrupt the rewrite;
# confirm acceptable for the doc sets this targets.
sed -i '' "s|$saved_linked_url|.$rewrite_path|g" "$saved_local_path" 2>/dev/null || \
sed -i "s|$saved_linked_url|.$rewrite_path|g" "$saved_local_path" 2>/dev/null || true
fi
done
# Process relative URLs (same BSD/GNU sed fallback as above)
for rel_path in $relative_urls; do
local full_url="${BASE_URL}${rel_path}"
download_and_process "$full_url" "$base_dir"
# Rewrite to local relative path (add . prefix); only the parenthesised
# "(path)" markdown-link form is rewritten, matching the extraction above
sed -i '' "s|(${rel_path})|(.$rel_path)|g" "$saved_local_path" 2>/dev/null || \
sed -i "s|(${rel_path})|(.$rel_path)|g" "$saved_local_path" 2>/dev/null || true
done
}
# Start from a clean mirror so stale files from a previous run cannot linger.
# "${TARGET_FOLDER:?}" aborts if the value is empty instead of expanding to
# nothing, and '--' stops option parsing for paths beginning with '-'.
rm -rf -- "${TARGET_FOLDER:?target folder must not be empty}"
mkdir -p -- "$TARGET_FOLDER"

# Start the recursive download from the root document
download_and_process "$SOURCE_URL" "$TARGET_FOLDER"

# One line per attempted URL in the tracking file.
# NOTE(review): this counts attempted URLs (failed fetches included), not
# only successful downloads.
TOTAL=$(wc -l < "$DOWNLOADED_FILE" | tr -d ' ')
echo ""
echo "Download complete. Files saved to: $TARGET_FOLDER"
echo "Total files downloaded: $TOTAL"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment