Downloads an llms.txt and all referenced markdown/text files into a local folder.
#!/bin/bash
# Downloads an llms.txt and all referenced markdown/text files into a local folder.
#
# This script recursively downloads documentation intended for LLMs, preserving
# the original URL structure as a local folder hierarchy. All URL references in
# downloaded files are rewritten to point to the local copies.
#
# Usage:
#   ./download-llms-txt.sh <source_url> <target_folder>
#
# Example:
#   ./download-llms-txt.sh https://google.github.io/adk-docs/llms.txt ~/src/my_project/agent_docs/adk-docs/
#
# This creates:
#   ~/src/my_project/agent_docs/adk-docs/llms.txt
#   ~/src/my_project/agent_docs/adk-docs/get-started/index.md
#   ... and other referenced files
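
# Fail fast: exit on any error (-e), on use of unset variables (-u), and when
# any command in a pipeline fails (-o pipefail).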
set -euo pipefail

if [[ $# -ne 2 ]]; then
  echo "Usage: $0 <source_url> <target_folder>" >&2
  exit 1
fi

SOURCE_URL="$1"
TARGET_FOLDER="$2"

# Extract base URL (scheme + host) from source URL
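# e.g., https://google.github.io/adk-docs/llms.txt -> https://google.github.io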
BASE_URL=$(echo "$SOURCE_URL" | sed -E 's|(https?://[^/]+).*|\1|')

# Extract base path (directory portion of the source URL path)
# e.g., https://google.github.io/adk-docs/llms.txt -> /adk-docs
URL_BASE_PATH=$(echo "$SOURCE_URL" | sed -E 's|https?://[^/]+||' | sed -E 's|/[^/]*$||')

# Track downloaded files using a temp file (compatible with bash 3.x)
DOWNLOADED_FILE=$(mktemp)
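# Remove the temp file on any exit path, including failures under set -e.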
trap 'rm -f "$DOWNLOADED_FILE"' EXIT

is_downloaded() {
  grep -qxF "$1" "$DOWNLOADED_FILE" 2>/dev/null
}

mark_downloaded() {
  echo "$1" >> "$DOWNLOADED_FILE"
}

download_and_process() {
  local url="$1"
  local base_dir="$2"

  # Skip if already downloaded
  if is_downloaded "$url"; then
    return
  fi
  mark_downloaded "$url"

  # Compute local path from URL, stripping the base path
  local url_path
  url_path=$(echo "$url" | sed -E 's|https?://[^/]+||')

  # Strip the URL base path to avoid duplicate directories
  if [[ -n "$URL_BASE_PATH" ]]; then
    url_path="${url_path#"$URL_BASE_PATH"}"
  fi

  # Handle root path
  if [[ -z "$url_path" || "$url_path" == "/" ]]; then
    url_path="/index.txt"
  fi

  local local_path="${base_dir}${url_path}"
  local local_dir
  local_dir=$(dirname "$local_path")

  # Create directory and download file
  mkdir -p "$local_dir"
  echo "Downloading: $url -> $local_path"
  if ! curl -sSfL "$url" -o "$local_path" 2>/dev/null; then
    echo " Warning: Failed to download $url" >&2
    return
  fi

  # Find all URLs in the downloaded file that point to .md or .txt files
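  # (the character class stops at whitespace, quotes, '<', '>', and ')' so
  # markdown link delimiters are not captured as part of the URL)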
  local linked_urls
  linked_urls=$(grep -oE 'https?://[^[:space:]"<>)]+\.(md|txt)' "$local_path" 2>/dev/null | sort -u || true)

  # Also find relative URLs (starting with /)
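  # The first grep matches parenthesized targets and full markdown links such
  # as [text](/path/page.md); the second grep extracts just the /path portion.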
  local relative_urls
  relative_urls=$(grep -oE '\(/[^[:space:]"<>)]+\.(md|txt)\)|\[[^]]*\]\(/[^)]+\.(md|txt)\)' "$local_path" 2>/dev/null | grep -oE '/[^[:space:]"<>)]+\.(md|txt)' | sort -u || true)

  # Process absolute URLs that match our base.
  # Keep a copy of local_path for use after the recursive calls below.
  local saved_local_path="$local_path"
  for linked_url in $linked_urls; do
    # Only process URLs from the same host
    if [[ "$linked_url" == "$BASE_URL"* ]]; then
      # Keep a copy of the URL for use after the recursive call
      local saved_linked_url="$linked_url"
      # Download the linked file (may already be downloaded via recursion)
      download_and_process "$linked_url" "$base_dir"
      # Rewrite URL to relative path
      local rewrite_path
      rewrite_path=$(echo "$saved_linked_url" | sed -E 's|https?://[^/]+||')
      if [[ -n "$URL_BASE_PATH" ]]; then
        rewrite_path="${rewrite_path#"$URL_BASE_PATH"}"
      fi
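      # BSD sed (macOS) requires an explicit empty backup suffix after -i;
      # GNU sed takes -i with no argument. Try the BSD form, then fall back.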
      sed -i '' "s|$saved_linked_url|.$rewrite_path|g" "$saved_local_path" 2>/dev/null || \
        sed -i "s|$saved_linked_url|.$rewrite_path|g" "$saved_local_path" 2>/dev/null || true
    fi
  done

  # Process relative URLs
  for rel_path in $relative_urls; do
    local full_url="${BASE_URL}${rel_path}"
    download_and_process "$full_url" "$base_dir"
    # Rewrite to local relative path (add . prefix)
    sed -i '' "s|(${rel_path})|(.$rel_path)|g" "$saved_local_path" 2>/dev/null || \
      sed -i "s|(${rel_path})|(.$rel_path)|g" "$saved_local_path" 2>/dev/null || true
  done
}

# Clean and create target folder
rm -rf "$TARGET_FOLDER"
mkdir -p "$TARGET_FOLDER"

# Start recursive download
download_and_process "$SOURCE_URL" "$TARGET_FOLDER"

TOTAL=$(wc -l < "$DOWNLOADED_FILE" | tr -d ' ')
echo ""
echo "Download complete. Files saved to: $TARGET_FOLDER"
echo "Total files downloaded: $TOTAL"