semanticentity · December 24, 2025 06:08 · semanticentity · Dec 23, 2025
diff --git a/wellformed-robotstxt.js b/wellformed-robotstxt.js
 (function () {
  /**
   * Robots.txt well-formedness heuristic (bookmarklet source).
   *
   * Reads the visible text of the current page, parses it line by line as a
   * robots.txt file, derives a verdict plus a list of findings, probes every
   * declared sitemap URL, and finally renders a dismissible overlay panel.
   */

  /**
   * Directives this checker recognizes. Any other directive found in the
   * file is surfaced as an advisory only — this is NOT a strict allowlist.
   */
  const KNOWN_DIRECTIVES = [
    "user-agent",
    "allow",
    "disallow",
    "sitemap",
    "crawl-delay",
    "host"
  ];

  // Status colors used for the verdict badge: fatal / advisory / clean.
  const COLOR_ERROR = "#d9534f";
  const COLOR_WARN = "#f0ad4e";
  const COLOR_OK = "#5cb85c";

  /**
   * Visible text of the page. innerText is preferred; textContent is the
   * fallback when innerText is empty or unavailable.
   */
  const rawContent = document.body.innerText || document.body.textContent;

  /**
   * Heuristic HTML detection: a non-empty <title> or more than five <div>
   * elements is treated as "this is an HTML page, not a robots.txt".
   */
  const isHtml =
    (document.querySelector("title") && document.title.length > 0) ||
    document.querySelectorAll("div").length > 5;

  const lines = rawContent.split("\n");

  /**
   * Parser state
   */
  let hasUserAgent = false;
  let hasGlobalUA = false;
  let orphanedRules = false;
  let groupCount = 0;
  let parsedLines = 0;
  // True while inside a run of User-agent lines that has not yet been
  // followed by an Allow/Disallow rule. Such a run is ONE group (RFC 9309),
  // so only the first User-agent line of the run increments groupCount.
  let inGroupHeader = false;

  const unknownDirectives = new Set();
  const sitemapUrls = [];
  const sitemapIssues = [];

  /**
   * Line-by-line parsing
   */
  lines.forEach(function (line) {
    // Everything after "#" is a comment; trim() also drops a trailing "\r".
    const text = line.split("#")[0].trim();
    if (!text) return;

    // Lines without a colon are not directives and are ignored.
    const colonAt = text.indexOf(":");
    if (colonAt === -1) return;

    // Split on the FIRST colon only so values such as URLs stay intact.
    const key = text.slice(0, colonAt).trim().toLowerCase();
    const value = text.slice(colonAt + 1).trim();

    if (key === "user-agent") {
      hasUserAgent = true;
      if (!inGroupHeader) {
        groupCount++;
        inGroupHeader = true;
      }
      if (value === "*") hasGlobalUA = true;
    } else if (key === "allow" || key === "disallow") {
      // A rule line ends the current run of User-agent lines.
      inGroupHeader = false;
      // Rules before any User-agent are orphaned.
      if (!hasUserAgent) orphanedRules = true;
    } else if (key === "sitemap") {
      sitemapUrls.push(value);

      // Absolute URLs strongly recommended
      if (!/^https?:\/\//i.test(value)) {
        sitemapIssues.push("Relative sitemap URL: " + value);
      }
    } else if (!KNOWN_DIRECTIVES.includes(key)) {
      // Unknown directives are ignored but surfaced as advisories.
      unknownDirectives.add(key);
    }

    parsedLines++;
  });

  /**
   * Verdict logic
   */
  let verdict = "Well-formed";
  let color = COLOR_OK;
  const details = [];

  /**
   * Record a non-fatal finding. Downgrades the badge to the warning color
   * but never overrides an already-fatal (error) color.
   * @param {string} message - Human-readable finding shown in the panel.
   */
  function addAdvisory(message) {
    if (color !== COLOR_ERROR) color = COLOR_WARN;
    details.push(message);
  }

  /**
   * Record a fatal finding and switch the verdict to "Likely Malformed".
   * @param {string} message - Human-readable finding shown in the panel.
   */
  function markMalformed(message) {
    verdict = "Likely Malformed";
    color = COLOR_ERROR;
    details.push(message);
  }

  if (isHtml || parsedLines === 0) {
    markMalformed("HTML or empty content detected.");
  } else if (!hasUserAgent) {
    markMalformed('Missing "User-agent" directive.');
  } else if (orphanedRules) {
    markMalformed("Allow/Disallow rules appear before any User-agent.");
  } else {
    // Advisory signals (non-fatal)
    if (unknownDirectives.size) {
      addAdvisory(
        "Unknown directives: " + Array.from(unknownDirectives).join(", ")
      );
    }

    if (!sitemapUrls.length) {
      addAdvisory("Missing Sitemap directive (recommended).");
    }

    if (!hasGlobalUA) {
      addAdvisory('No global "User-agent: *" group found.');
    }

    sitemapIssues.forEach(addAdvisory);
  }

  /**
   * Probe one sitemap URL: HEAD first, then GET when HEAD either fails at
   * the network level or answers with a non-OK status (some servers reject
   * HEAD, e.g. with 405). Never rejects; findings go through addAdvisory.
   *
   * NOTE: fetch() rejects for cross-origin URLs that do not opt in via CORS,
   * so such sitemaps are reported as "fetch failed" even if they exist.
   *
   * @param {string} url - Sitemap URL exactly as written in the file.
   * @returns {Promise<void>} Settles once the probe has finished.
   */
  function checkSitemap(url) {
    function viaGet() {
      return fetch(url, { method: "GET" });
    }

    return fetch(url, { method: "HEAD" })
      .then(function (res) {
        return res.ok ? res : viaGet();
      }, viaGet)
      .then(function (res) {
        if (!res.ok) addAdvisory("Sitemap unreachable: " + url);
      })
      .catch(function () {
        addAdvisory("Sitemap fetch failed: " + url);
      });
  }

  /**
   * Render the overlay panel. Built from DOM nodes with textContent rather
   * than an innerHTML string, because the findings contain text taken from
   * the inspected page (sitemap URLs, directive names) that must never be
   * interpreted as markup.
   */
  function renderPanel() {
    const panel = document.createElement("div");
    panel.style.cssText =
      "position:fixed;top:20px;right:20px;width:360px;" +
      "background:#f8f9fa;color:#333;border:1px solid #ccc;" +
      "box-shadow:0 4px 12px rgba(0,0,0,.15);" +
      "padding:15px;z-index:99999;font-family:sans-serif;" +
      "font-size:14px;border-radius:4px;" +
      "max-height:90vh;overflow:auto";

    const closeButton = document.createElement("span");
    closeButton.style.cssText = "float:right;cursor:pointer";
    closeButton.textContent = "✕";
    // A listener (instead of an inline onclick attribute) also works on
    // pages whose Content-Security-Policy blocks inline handlers.
    closeButton.addEventListener("click", function () {
      panel.remove();
    });

    const heading = document.createElement("h3");
    heading.textContent = "Robots.txt Heuristic";

    const badge = document.createElement("div");
    badge.style.cssText =
      "background:" + color +
      ";color:#fff;padding:8px;margin:8px 0;font-weight:bold";
    badge.textContent = verdict;

    const list = document.createElement("ul");
    details.forEach(function (detail) {
      const item = document.createElement("li");
      item.textContent = detail;
      list.appendChild(item);
    });

    const footer = document.createElement("div");
    footer.style.cssText = "font-size:12px;color:#666";
    footer.textContent =
      groupCount + " groups · " + sitemapUrls.length + " sitemaps";

    [closeButton, heading, badge, list, footer].forEach(function (node) {
      panel.appendChild(node);
    });

    document.body.appendChild(panel);
  }

  // Probe all sitemaps in parallel, then render regardless of the outcome.
  Promise.all(sitemapUrls.map(checkSitemap)).finally(renderPanel);
 })();
diff --git a/wellformed-robotstxt.md b/wellformed-robotstxt.md
diff --git a/wellformed-robotstxt.min.js b/wellformed-robotstxt.min.js
 javascript:/* robots.txt well-formedness heuristic - bookmarklet build; keep on ONE line; page-derived text is HTML-escaped before insertion */(function(){var K=["user-agent","allow","disallow","sitemap","crawl-delay","host"],R=document.body.innerText||document.body.textContent,H=(document.querySelector("title")&&document.title.length>0)||document.querySelectorAll("div").length>5,L=R.split("\n"),u=!1,G=!1,o=!1,h=!1,g=0,n=0,d=new Set,S=[],m=[],E=function(t){return String(t).replace(/[&<>"']/g,function(c){return"&#"+c.charCodeAt(0)+";"})};L.forEach(function(t){t=t.split("#")[0].trim();if(!t)return;var e=t.split(":");if(e.length<2)return;var k=e[0].trim().toLowerCase(),v=e.slice(1).join(":").trim();if(k==="user-agent"){u=!0;h||(g++,h=!0);v==="*"&&(G=!0)}else if(k==="allow"||k==="disallow"){h=!1;u||(o=!0)}else if(k==="sitemap"){S.push(v);/^https?:\/\//i.test(v)||m.push("Relative sitemap URL: "+v)}else K.includes(k)||d.add(k);n++});var verdict="Well-formed",color="#5cb85c",details=[],W=function(t){color!=="#d9534f"&&(color="#f0ad4e");details.push(t)};if(H||n===0){verdict="Likely Malformed";color="#d9534f";details.push("HTML or empty content.")}else if(!u){verdict="Likely Malformed";color="#d9534f";details.push('Missing "User-agent".')}else if(o){verdict="Likely Malformed";color="#d9534f";details.push("Orphaned Allow/Disallow.")}else{d.size&&W("Unknown directives: "+Array.from(d).join(", "));S.length||W("Missing Sitemap (recommended).");G||W('No "User-agent: *".');m.forEach(function(t){W(t)})}Promise.all(S.map(function(s){var F=function(){return fetch(s,{method:"GET"})};return fetch(s,{method:"HEAD"}).then(function(r){return r.ok?r:F()},F).then(function(r){r.ok||W("Sitemap unreachable: "+s)}).catch(function(){W("Sitemap fetch failed: "+s)})})).finally(function(){var e=document.createElement("div");e.style.cssText="position:fixed;top:20px;right:20px;width:360px;background:#f8f9fa;color:#333;border:1px solid #ccc;box-shadow:0 4px 12px rgba(0,0,0,.15);padding:15px;z-index:99999;font-family:sans-serif;font-size:14px;border-radius:4px;max-height:90vh;overflow:auto";e.innerHTML='<span style="float:right;cursor:pointer">✕</span><h3>Robots.txt Heuristic</h3><div style="background:'+color+';color:#fff;padding:8px;margin:8px 0;font-weight:bold">'+verdict+'</div><ul>'+details.map(function(t){return"<li>"+E(t)+"</li>"}).join("")+'</ul><div style="font-size:12px;color:#666">'+g+" groups · "+S.length+" sitemaps</div>";e.firstChild.onclick=function(){e.remove()};document.body.appendChild(e)})})();
	(function () {
	  /**
	   * Robots.txt well-formedness heuristic (bookmarklet source).
	   *
	   * Reads the visible text of the current page, parses it line by line as a
	   * robots.txt file, derives a verdict plus a list of findings, probes every
	   * declared sitemap URL, and finally renders a dismissible overlay panel.
	   */

	  /**
	   * Directives this checker recognizes. Any other directive found in the
	   * file is surfaced as an advisory only — this is NOT a strict allowlist.
	   */
	  const KNOWN_DIRECTIVES = [
	    "user-agent",
	    "allow",
	    "disallow",
	    "sitemap",
	    "crawl-delay",
	    "host"
	  ];

	  // Status colors used for the verdict badge: fatal / advisory / clean.
	  const COLOR_ERROR = "#d9534f";
	  const COLOR_WARN = "#f0ad4e";
	  const COLOR_OK = "#5cb85c";

	  /**
	   * Visible text of the page. innerText is preferred; textContent is the
	   * fallback when innerText is empty or unavailable.
	   */
	  const rawContent = document.body.innerText || document.body.textContent;

	  /**
	   * Heuristic HTML detection: a non-empty <title> or more than five <div>
	   * elements is treated as "this is an HTML page, not a robots.txt".
	   */
	  const isHtml =
	    (document.querySelector("title") && document.title.length > 0) ||
	    document.querySelectorAll("div").length > 5;

	  const lines = rawContent.split("\n");

	  /**
	   * Parser state
	   */
	  let hasUserAgent = false;
	  let hasGlobalUA = false;
	  let orphanedRules = false;
	  let groupCount = 0;
	  let parsedLines = 0;
	  // True while inside a run of User-agent lines that has not yet been
	  // followed by an Allow/Disallow rule. Such a run is ONE group (RFC 9309),
	  // so only the first User-agent line of the run increments groupCount.
	  let inGroupHeader = false;

	  const unknownDirectives = new Set();
	  const sitemapUrls = [];
	  const sitemapIssues = [];

	  /**
	   * Line-by-line parsing
	   */
	  lines.forEach(function (line) {
	    // Everything after "#" is a comment; trim() also drops a trailing "\r".
	    const text = line.split("#")[0].trim();
	    if (!text) return;

	    // Lines without a colon are not directives and are ignored.
	    const colonAt = text.indexOf(":");
	    if (colonAt === -1) return;

	    // Split on the FIRST colon only so values such as URLs stay intact.
	    const key = text.slice(0, colonAt).trim().toLowerCase();
	    const value = text.slice(colonAt + 1).trim();

	    if (key === "user-agent") {
	      hasUserAgent = true;
	      if (!inGroupHeader) {
	        groupCount++;
	        inGroupHeader = true;
	      }
	      if (value === "*") hasGlobalUA = true;
	    } else if (key === "allow" || key === "disallow") {
	      // A rule line ends the current run of User-agent lines.
	      inGroupHeader = false;
	      // Rules before any User-agent are orphaned.
	      if (!hasUserAgent) orphanedRules = true;
	    } else if (key === "sitemap") {
	      sitemapUrls.push(value);

	      // Absolute URLs strongly recommended
	      if (!/^https?:\/\//i.test(value)) {
	        sitemapIssues.push("Relative sitemap URL: " + value);
	      }
	    } else if (!KNOWN_DIRECTIVES.includes(key)) {
	      // Unknown directives are ignored but surfaced as advisories.
	      unknownDirectives.add(key);
	    }

	    parsedLines++;
	  });

	  /**
	   * Verdict logic
	   */
	  let verdict = "Well-formed";
	  let color = COLOR_OK;
	  const details = [];

	  /**
	   * Record a non-fatal finding. Downgrades the badge to the warning color
	   * but never overrides an already-fatal (error) color.
	   * @param {string} message - Human-readable finding shown in the panel.
	   */
	  function addAdvisory(message) {
	    if (color !== COLOR_ERROR) color = COLOR_WARN;
	    details.push(message);
	  }

	  /**
	   * Record a fatal finding and switch the verdict to "Likely Malformed".
	   * @param {string} message - Human-readable finding shown in the panel.
	   */
	  function markMalformed(message) {
	    verdict = "Likely Malformed";
	    color = COLOR_ERROR;
	    details.push(message);
	  }

	  if (isHtml || parsedLines === 0) {
	    markMalformed("HTML or empty content detected.");
	  } else if (!hasUserAgent) {
	    markMalformed('Missing "User-agent" directive.');
	  } else if (orphanedRules) {
	    markMalformed("Allow/Disallow rules appear before any User-agent.");
	  } else {
	    // Advisory signals (non-fatal)
	    if (unknownDirectives.size) {
	      addAdvisory(
	        "Unknown directives: " + Array.from(unknownDirectives).join(", ")
	      );
	    }

	    if (!sitemapUrls.length) {
	      addAdvisory("Missing Sitemap directive (recommended).");
	    }

	    if (!hasGlobalUA) {
	      addAdvisory('No global "User-agent: *" group found.');
	    }

	    sitemapIssues.forEach(addAdvisory);
	  }

	  /**
	   * Probe one sitemap URL: HEAD first, then GET when HEAD either fails at
	   * the network level or answers with a non-OK status (some servers reject
	   * HEAD, e.g. with 405). Never rejects; findings go through addAdvisory.
	   *
	   * NOTE: fetch() rejects for cross-origin URLs that do not opt in via CORS,
	   * so such sitemaps are reported as "fetch failed" even if they exist.
	   *
	   * @param {string} url - Sitemap URL exactly as written in the file.
	   * @returns {Promise<void>} Settles once the probe has finished.
	   */
	  function checkSitemap(url) {
	    function viaGet() {
	      return fetch(url, { method: "GET" });
	    }

	    return fetch(url, { method: "HEAD" })
	      .then(function (res) {
	        return res.ok ? res : viaGet();
	      }, viaGet)
	      .then(function (res) {
	        if (!res.ok) addAdvisory("Sitemap unreachable: " + url);
	      })
	      .catch(function () {
	        addAdvisory("Sitemap fetch failed: " + url);
	      });
	  }

	  /**
	   * Render the overlay panel. Built from DOM nodes with textContent rather
	   * than an innerHTML string, because the findings contain text taken from
	   * the inspected page (sitemap URLs, directive names) that must never be
	   * interpreted as markup.
	   */
	  function renderPanel() {
	    const panel = document.createElement("div");
	    panel.style.cssText =
	      "position:fixed;top:20px;right:20px;width:360px;" +
	      "background:#f8f9fa;color:#333;border:1px solid #ccc;" +
	      "box-shadow:0 4px 12px rgba(0,0,0,.15);" +
	      "padding:15px;z-index:99999;font-family:sans-serif;" +
	      "font-size:14px;border-radius:4px;" +
	      "max-height:90vh;overflow:auto";

	    const closeButton = document.createElement("span");
	    closeButton.style.cssText = "float:right;cursor:pointer";
	    closeButton.textContent = "✕";
	    // A listener (instead of an inline onclick attribute) also works on
	    // pages whose Content-Security-Policy blocks inline handlers.
	    closeButton.addEventListener("click", function () {
	      panel.remove();
	    });

	    const heading = document.createElement("h3");
	    heading.textContent = "Robots.txt Heuristic";

	    const badge = document.createElement("div");
	    badge.style.cssText =
	      "background:" + color +
	      ";color:#fff;padding:8px;margin:8px 0;font-weight:bold";
	    badge.textContent = verdict;

	    const list = document.createElement("ul");
	    details.forEach(function (detail) {
	      const item = document.createElement("li");
	      item.textContent = detail;
	      list.appendChild(item);
	    });

	    const footer = document.createElement("div");
	    footer.style.cssText = "font-size:12px;color:#666";
	    footer.textContent =
	      groupCount + " groups · " + sitemapUrls.length + " sitemaps";

	    [closeButton, heading, badge, list, footer].forEach(function (node) {
	      panel.appendChild(node);
	    });

	    document.body.appendChild(panel);
	  }

	  // Probe all sitemaps in parallel, then render regardless of the outcome.
	  Promise.all(sitemapUrls.map(checkSitemap)).finally(renderPanel);
	})();
No results found