Skip to content

Instantly share code, notes, and snippets.

@semanticentity
Last active December 24, 2025 06:08
Show Gist options
  • Select an option

  • Save semanticentity/1b9878878c47141eec55d28e0595ad6e to your computer and use it in GitHub Desktop.

Select an option

Save semanticentity/1b9878878c47141eec55d28e0595ad6e to your computer and use it in GitHub Desktop.
Heuristic robots.txt validator — detects malformed structure, orphaned rules, missing User-agent, sitemap issues, and unknown directives in-place. Optimized for real-world crawler behavior.
(function () {
  "use strict";

  /**
   * Heuristic robots.txt validator.
   * Run it on the page that displays the robots.txt file: it parses the
   * visible text line by line, derives a verdict, probes every listed
   * sitemap, and renders the result in a fixed overlay panel.
   */

  /**
   * Recognized directives
   * This is NOT a strict allowlist — unknown names are advisories only
   */
  const KNOWN_DIRECTIVES = [
    "user-agent",
    "allow",
    "disallow",
    "sitemap",
    "crawl-delay",
    "host"
  ];

  /** Id of the overlay, used to find a panel left behind by an earlier run */
  const PANEL_ID = "robots-txt-heuristic-panel";

  /** Banner colors for the three outcomes */
  const COLOR_OK = "#5cb85c";
  const COLOR_ADVISORY = "#f0ad4e";
  const COLOR_MALFORMED = "#d9534f";

  /**
   * Remove a previous overlay before reading the page text: its lines
   * (e.g. "Unknown directives: ...") contain colons and would otherwise be
   * parsed as robots.txt content on a second run
   */
  const stalePanel = document.getElementById(PANEL_ID);
  if (stalePanel) {
    stalePanel.remove();
  }

  /**
   * Extract visible text content
   * Works for raw robots.txt and mis-served HTML pages
   */
  const rawContent = document.body.innerText || document.body.textContent;

  /**
   * Heuristic HTML detection
   * If this trips, the file is very likely not a real robots.txt
   */
  const isHtml =
    (document.querySelector("title") && document.title.length > 0) ||
    document.querySelectorAll("div").length > 5;

  /**
   * State tracking
   */
  let hasUserAgent = false;
  let hasGlobalUA = false;
  let orphanedRules = false;
  let groupCount = 0;
  let parsedLines = 0;
  // True while the most recent group-relevant line was a User-agent line
  let inGroupHeader = false;
  const unknownDirectives = new Set();
  const sitemapUrls = [];
  const sitemapIssues = [];

  /**
   * Line-by-line parsing
   */
  rawContent.split("\n").forEach(function (rawLine) {
    // Strip comments and whitespace (trim also removes a trailing "\r")
    const line = rawLine.split("#")[0].trim();
    if (!line) return;

    // Ignore lines without colon; only the first colon separates key and value
    const colonAt = line.indexOf(":");
    if (colonAt === -1) return;
    const key = line.slice(0, colonAt).trim().toLowerCase();
    const value = line.slice(colonAt + 1).trim();

    if (key === "user-agent") {
      hasUserAgent = true;
      // Consecutive User-agent lines share one group, so count only the first
      if (!inGroupHeader) groupCount++;
      inGroupHeader = true;
      if (value === "*") hasGlobalUA = true;
    } else if (key === "allow" || key === "disallow") {
      // A rule line ends the User-agent header of the current group
      inGroupHeader = false;
      // Rules before any User-agent are orphaned
      if (!hasUserAgent) orphanedRules = true;
    } else if (key === "sitemap") {
      sitemapUrls.push(value);
      // Absolute URLs strongly recommended
      if (!/^https?:\/\//i.test(value)) {
        sitemapIssues.push("Relative sitemap URL: " + value);
      }
    } else if (!KNOWN_DIRECTIVES.includes(key)) {
      // Unknown directives are ignored but surfaced as advisories
      unknownDirectives.add(key);
    }
    parsedLines++;
  });

  /**
   * Verdict logic
   */
  let verdict = "Well-formed";
  let color = COLOR_OK;
  const details = [];

  /** Record a fatal finding and switch the banner to the malformed state */
  function markMalformed(reason) {
    verdict = "Likely Malformed";
    color = COLOR_MALFORMED;
    details.push(reason);
  }

  /**
   * Record a non-fatal finding. The banner is only downgraded from OK to
   * advisory; a malformed verdict keeps its color.
   */
  function addAdvisory(message) {
    details.push(message);
    if (color === COLOR_OK) color = COLOR_ADVISORY;
  }

  if (isHtml || parsedLines === 0) {
    markMalformed("HTML or empty content detected.");
  } else if (!hasUserAgent) {
    markMalformed('Missing "User-agent" directive.');
  } else if (orphanedRules) {
    markMalformed("Allow/Disallow rules appear before any User-agent.");
  } else {
    // Advisory signals (non-fatal)
    if (unknownDirectives.size) {
      addAdvisory(
        "Unknown directives: " + Array.from(unknownDirectives).join(", ")
      );
    }
    if (!sitemapUrls.length) {
      addAdvisory("Missing Sitemap directive (recommended).");
    }
    if (!hasGlobalUA) {
      addAdvisory('No global "User-agent: *" group found.');
    }
    sitemapIssues.forEach(function (issue) {
      addAdvisory(issue);
    });
  }

  /**
   * Escape text for safe insertion into HTML. The detail strings embed values
   * read from the page (sitemap URLs, directive names), so they must never be
   * interpreted as markup.
   */
  function escapeHtml(text) {
    return String(text).replace(/[&<>"']/g, function (ch) {
      return "&#" + ch.charCodeAt(0) + ";";
    });
  }

  /**
   * Check sitemap reachability
   * Uses HEAD first; falls back to GET when HEAD fails or answers with a
   * non-2xx status (some servers reject HEAD). The returned promise never
   * rejects — every outcome is reported through addAdvisory.
   * NOTE: fetch() also rejects for cross-origin responses without CORS
   * headers, so a sitemap on another origin may be reported as failed even
   * though crawlers can reach it.
   */
  function checkSitemap(url) {
    return fetch(url, { method: "HEAD" })
      .then(function (res) {
        // Turn a non-OK HEAD into a rejection so the GET fallback runs
        if (!res.ok) throw new Error("HEAD " + res.status);
      })
      .catch(function () {
        return fetch(url, { method: "GET" }).then(function (res) {
          if (!res.ok) addAdvisory("Sitemap unreachable: " + url);
        });
      })
      .catch(function () {
        addAdvisory("Sitemap fetch failed: " + url);
      });
  }

  /**
   * Render overlay UI
   */
  function renderPanel() {
    const panel = document.createElement("div");
    panel.id = PANEL_ID;
    panel.style.cssText =
      "position:fixed;top:20px;right:20px;width:360px;" +
      "background:#f8f9fa;color:#333;border:1px solid #ccc;" +
      "box-shadow:0 4px 12px rgba(0,0,0,.15);" +
      "padding:15px;z-index:99999;font-family:sans-serif;" +
      "font-size:14px;border-radius:4px;" +
      "max-height:90vh;overflow:auto";

    const items = details
      .map(function (detail) {
        return "<li>" + escapeHtml(detail) + "</li>";
      })
      .join("");

    // verdict, color and the counters are produced by this script, not by the
    // page, so only the detail strings need escaping
    panel.innerHTML =
      '<span style="float:right;cursor:pointer" onclick="this.parentElement.remove()">✕</span>' +
      "<h3>Robots.txt Heuristic</h3>" +
      '<div style="background:' + color +
      ';color:#fff;padding:8px;margin:8px 0;font-weight:bold">' +
      verdict +
      "</div>" +
      "<ul>" + items + "</ul>" +
      '<div style="font-size:12px;color:#666">' +
      groupCount + " groups · " +
      sitemapUrls.length + " sitemaps</div>";
    document.body.appendChild(panel);
  }

  // Render once every sitemap probe has settled
  Promise.all(sitemapUrls.map(checkSitemap)).finally(renderPanel);
})();

“Well-Formed” robots.txt Heuristic Guide

This is a practical, implementation-driven guide for determining whether a robots.txt file is likely to be parsed correctly by modern crawlers.

The Robots Exclusion Protocol was standardized as RFC 9309 in 2022, but the RFC is deliberately permissive, and many deployed files and crawler parsers predate it.

In practice, crawler behavior (especially Google’s) defines what “well-formed” means.

This guide is designed for bookmarklets, linters, and lightweight validators that prioritize real-world parsing behavior over theoretical grammar.


Scope

This guide answers one question:

Does this robots.txt appear well-formed enough to be interpreted correctly by common crawler parsers?

It does not attempt to:

  • Enforce a strict grammar
  • Predict indexing behavior
  • Enforce SEO best practices
  • Guarantee crawler outcomes

1. File Availability

A robots.txt file is considered present if:

  • It is accessible at /robots.txt

  • The HTTP response is:

    • 200 OK, or
    • 401 / 403 (still treated as valid by major crawlers)

Notes:

  • A 404 means the file does not exist, not that it is malformed
  • Content-Type: text/plain is preferred but not required

2. Encoding and Content Type

Basic expectations:

  • Plain text (not HTML, JSON, or binary)
  • UTF-8 or ASCII encoding
  • Line-based content
  • No obvious binary signatures

Strong signals of a likely malformed file:

  • HTML pages (themes, error templates, CMS fallbacks)
  • Minified JS, JSON, or binary output

3. Line Grammar

Each non-empty, non-comment line is parsed independently.

Valid line structure:

field-name ":" optional-whitespace value

Rules:

  • Field names are case-insensitive
  • Lines without a colon (:) are ignored
  • Unknown field names are ignored, not errors
  • Inline comments are allowed using #

There is no global syntax error state. One malformed line does not invalidate the file.


4. Comments

  • Lines starting with # are comments
  • Comments may appear inline after directives
  • Comment-only lines are ignored

5. Group Structure

Rules are evaluated in groups.

A group is defined as:

User-agent: <value>
<directive>: <value>
<directive>: <value>

Behavior:

  • A group begins with one or more User-agent lines
  • All subsequent directives apply to that group
  • A new User-agent starts a new group
  • Empty lines are allowed and commonly used as separators

6. Commonly Recognized Directives

The following directives are widely recognized and safe to parse:

  • User-agent
  • Disallow
  • Allow
  • Crawl-delay (ignored by Google, used by some crawlers)
  • Sitemap (global; not group-scoped)

Unknown directives must be ignored, not treated as errors.


7. Structural Requirements (Heuristic)

A file is generally considered well-formed if:

  • At least one User-agent directive exists
  • Allow / Disallow rules appear after a User-agent
  • No rules are orphaned before the first User-agent

Violations of these rules are likely to cause mis-parsing.


8. Advisory (Non-Fatal) Signals

The following conditions do not make a file malformed, but may be surfaced as warnings:

  • Missing Sitemap directive
  • Relative (non-absolute) sitemap URLs
  • Unreachable sitemap URLs
  • Missing global User-agent: * group
  • Unknown or legacy directives

These are best-practice or robustness signals, not parsing failures.


9. Conditions That Are NOT Errors

Do not treat the following as malformed:

  • Duplicate User-agent entries
  • Multiple groups for the same agent
  • Mixed casing
  • Trailing whitespace
  • Empty Disallow: (means allow all)
  • Legacy or ignored directives (Noindex, etc.)

Modern crawlers tolerate all of the above.


10. Practical Verdict Categories

Recommended output categories for tools:

Well-formed

  • Parsable
  • At least one valid group
  • No orphaned rules

Well-formed with advisories

  • Parsable
  • Missing recommended signals (e.g. sitemap)
  • Contains ignored or legacy directives

Likely malformed

  • HTML or binary content
  • No parsable directives
  • Allow / Disallow before any User-agent

11. Recommended Public Framing

When presenting results to users:

“robots.txt parsing is permissive by design, and the standard (RFC 9309) leaves much to crawler implementations. This check verifies whether the file appears well-formed according to common crawler parsers, including Google’s.”

This framing is accurate, defensible, and avoids false precision.


Summary

  • robots.txt is permissive by design
  • Modern behavior is defined by crawler implementations
  • Structural heuristics outperform strict validation
  • Bookmarklet-level tools should prioritize parsing safety
javascript:/* robots.txt heuristic bookmarklet: keep on ONE line when saving as a bookmark URL; page-derived text is HTML-escaped before rendering */(function(){var P=document.getElementById("robots-txt-heuristic-panel");P&&P.remove();var K=["user-agent","allow","disallow","sitemap","crawl-delay","host"],R=document.body.innerText||document.body.textContent,H=(document.querySelector("title")&&document.title.length>0)||document.querySelectorAll("div").length>5,L=R.split("\n"),u=!1,G=!1,o=!1,h=!1,g=0,n=0,d=new Set,S=[],m=[];L.forEach(function(t){t=t.split("#")[0].trim();if(!t)return;var e=t.split(":");if(e.length<2)return;var k=e[0].trim().toLowerCase(),v=e.slice(1).join(":").trim();if(k==="user-agent"){u=!0;h||g++;h=!0;v==="*"&&(G=!0)}else if(k==="allow"||k==="disallow"){h=!1;u||(o=!0)}else if(k==="sitemap"){S.push(v);/^https?:\/\//i.test(v)||m.push("Relative sitemap URL: "+v)}else K.includes(k)||d.add(k);n++});var verdict="Well-formed",color="#5cb85c",details=[];function A(t){details.push(t);color==="#5cb85c"&&(color="#f0ad4e")}function X(t){return String(t).replace(/[&<>"']/g,function(c){return"&#"+c.charCodeAt(0)+";"})}if(H||n===0){verdict="Likely Malformed";color="#d9534f";details.push("HTML or empty content.")}else if(!u){verdict="Likely Malformed";color="#d9534f";details.push('Missing "User-agent".')}else if(o){verdict="Likely Malformed";color="#d9534f";details.push("Orphaned Allow/Disallow.")}else{d.size&&A("Unknown directives: "+Array.from(d).join(", "));S.length||A("Missing Sitemap (recommended).");G||A('No "User-agent: *".');m.forEach(function(t){A(t)})}Promise.all(S.map(function(s){return fetch(s,{method:"HEAD"}).then(function(r){if(!r.ok)throw new Error("HEAD "+r.status)}).catch(function(){return fetch(s,{method:"GET"}).then(function(r){r.ok||A("Sitemap unreachable: "+s)})}).catch(function(){A("Sitemap fetch failed: "+s)})})).finally(function(){var e=document.createElement("div");e.id="robots-txt-heuristic-panel";e.style.cssText="position:fixed;top:20px;right:20px;width:360px;background:#f8f9fa;color:#333;border:1px solid #ccc;box-shadow:0 4px 12px rgba(0,0,0,.15);padding:15px;z-index:99999;font-family:sans-serif;font-size:14px;border-radius:4px;max-height:90vh;overflow:auto";e.innerHTML='<span style="float:right;cursor:pointer" onclick="this.parentElement.remove()">✕</span><h3>Robots.txt Heuristic</h3><div style="background:'+color+';color:#fff;padding:8px;margin:8px 0;font-weight:bold">'+verdict+'</div><ul>'+details.map(function(t){return"<li>"+X(t)+"</li>"}).join("")+'</ul><div style="font-size:12px;color:#666">'+g+" groups · "+S.length+" sitemaps</div>";document.body.appendChild(e)})})();
@semanticentity
Copy link
Author

Well Formed:

Screenshot_2025-12-23_at_6 39 40_AM

Likely Malformed:

Screenshot_2025-12-23_at_6 40 51_AM

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment