Skip to content

Instantly share code, notes, and snippets.

@sogaiu
Last active February 6, 2026 13:00
Show Gist options
  • Select an option

  • Save sogaiu/e923318051ccba3fa19205fdc95e3cf3 to your computer and use it in GitHub Desktop.

Select an option

Save sogaiu/e923318051ccba3fa19205fdc95e3cf3 to your computer and use it in GitHub Desktop.
(defn capture-empty-element
  ``
  Build the struct for an empty-element tag.

  `args` is the element name followed by alternating attribute names
  and values.  Returns `{:tag name}` when only the name is given and
  `{:tag name :attrs <table>}` otherwise.  Raises an error when called
  with no arguments.
  ``
  [& args]
  (def arg-count (length args))
  (cond
    (zero? arg-count)
    (errorf "expected one or more args, got 0")
    # just the element name
    (one? arg-count)
    {:tag (first args)}
    # everything after the name is attribute name / value pairs
    {:tag (first args)
     :attrs (table ;(slice args 1))}))
(comment
# usages of capture-empty-element; each form is followed by `# =>`
# and the value it is expected to produce
#
# name only: no :attrs key appears in the result
(capture-empty-element "empty")
# =>
{:tag "empty"}
# name plus one attribute name / value pair
(capture-empty-element "empty" "what" "mind")
# =>
{:attrs @{"what" "mind"} :tag "empty"}
)
(comment
# exploration: group neighbouring items by whether they are strings,
# join each run of strings into one string, then flatten the groups
# back into a single array; nothing here removes the leading ""
(->> (partition-by |(not (string? $))
["" :x "hi" :a "there " "mate" {:a 1} "ho"])
(keep |(if (string? (first $))
(string/join $ "")
$))
flatten)
# =>
@["" :x "hi" :a "there mate" {:a 1} "ho"]
)
(defn merge-strings
  ``
  Collapse each run of neighbouring strings in `ind` into one string.

  Non-string items keep their relative position.  Runs that join to
  the empty string are dropped.  Returns a new array.
  ``
  [ind]
  # neighbouring items land in the same run when they agree on
  # "is a string"
  (def runs
    (partition-by (fn [item] (not (string? item))) ind))
  # string runs become one string, other runs are kept as they are
  (def pieces
    (seq [run :in runs]
      (if (string? (first run))
        (string/join run "")
        run)))
  (def non-empty-pieces
    (filter (fn [piece] (not (empty? piece))) pieces))
  # unwrap the non-string runs again
  (flatten non-empty-pieces))
(comment
# usages of merge-strings; each form is followed by `# =>` and the
# value it is expected to produce
#
# all items are strings, so they end up as a single joined string
(merge-strings ["<" "p" ">"
"<"
"a href=\"https://lobste.rs/\""
">" "Comments" "<" "/a" ">"
"<" "/p" ">"])
# =>
@[(string "<p>"
"<a href=\"https://lobste.rs/\">Comments</a>"
"</p>")]
# mixed items: the leading "" is dropped and only the neighbouring
# strings "there " and "mate" are joined
(merge-strings ["" :x "hi" :a "there " "mate" {:a 1} "ho"])
# =>
@[:x "hi" :a "there mate" {:a 1} "ho"]
)
(defn capture-nonempty-element
  ``
  Build the struct for an element made of a start tag, content and an
  end tag.

  `args` is, in order:

  * the start-tag group: an indexed value holding the element name
    followed by alternating attribute names and values
  * zero or more content pieces (strings and / or child structs)
  * the end-tag name

  Returns a struct with `:tag`, plus `:attrs` (a table) when the start
  tag has attributes and `:content` (an array) when merging the
  content pieces leaves anything.  Returns nil when the end-tag name
  differs from the start-tag name.  Raises an error when called with
  no arguments.

  When called with only the start-tag group there is no end-tag name
  to check, so the element is built from the start tag alone.
  Attributes from the start tag are kept for every argument count.
  ``
  [& args]
  (def arg-count (length args))
  (when (zero? arg-count)
    (errorf "expected one or more args, got 0"))
  (def stag (first args))
  (def sname (first stag))
  # with a lone start tag the last argument is the start-tag group
  # itself, so only compare names when an end tag was passed
  (when (or (one? arg-count)
            (= sname (last args)))
    (def attrs (table ;(drop 1 stag)))
    # content pieces sit strictly between the start-tag group and the
    # end-tag name
    (def content
      (if (> arg-count 2)
        (merge-strings (slice args 1 -2))
        @[]))
    # keys whose value is nil are left out of the resulting struct
    {:tag sname
     :attrs (if (not (empty? attrs)) attrs nil)
     :content (if (not (empty? content)) content nil)}))
(comment
# usages of capture-nonempty-element; each form is followed by `# =>`
# and the value it is expected to produce
#
# start-tag group only
(capture-nonempty-element @["fun"])
# =>
{:tag "fun"}
# start-tag group and matching end-tag name, no content pieces
(capture-nonempty-element @["fun"] "fun")
# =>
{:tag "fun"}
# an empty content string does not produce a :content key
(capture-nonempty-element @["fun"] "" "fun")
# =>
{:tag "fun"}
# attributes come from the start-tag group, content from the middle
(capture-nonempty-element @["fun" "flavor" "spicy"] "more" "fun")
# =>
{:attrs @{"flavor" "spicy"}
:content @["more"]
:tag "fun"}
# many string pieces (including empty ones) merge into one string
(capture-nonempty-element
@["description"]
"" "<" "p" ">"
"" "<" "a href=\"https://lobste.rs/s/2svv99/competence_as_tragedy\""
">" "Comments" "<" "/a" ">"
"" "<" "/p" ">" "" "description")
# =>
{:content
@[(string "<" "p" ">"
"<"
"a href=\"https://lobste.rs/s/2svv99/competence_as_tragedy\""
">" "Comments" "<" "/a" ">"
"<" "/p" ">")]
:tag "description"}
)
# https://www.w3.org/TR/REC-xml/
# https://www.xml.com/axml/axml.html
(def xmlish-peg
  # PEG grammar for a simplified subset of XML.  It is a quasiquoted
  # mutable table: the capture functions are spliced in with `,`.
  # Section numbers in the comments refer to the XML recommendation.
  #
  # XXX: things marked with just SSS below were simplified
  ~@{# 2.1 Well-Formed XML Documents
     :main (sequence :prolog :element (any :misc))
     # 3 Logical Structures
     :element (choice :empty-elem-tag :non-empty-elem)
     #
     # captures handed to the function: element name, then attribute
     # name / value pairs
     :empty-elem-tag (cmt (sequence "<"
                                    (capture :name)
                                    (any (sequence :s+ :attribute))
                                    :s*
                                    "/>")
                          ,capture-empty-element)
     #
     # captures handed to the function: start-tag group, content
     # pieces, end-tag name
     :non-empty-elem (cmt (sequence :stag :content :etag)
                          ,capture-nonempty-element)
     # 2.3 Common Syntactic Constructs
     :name-start-char (choice ":" :a "_") # SSS
     :name-char (sequence (choice :name-start-char "-" "." :d)) # SSS
     :name (sequence :name-start-char (any :name-char))
     #
     # the capture is the text between the quotes
     :att-value
     (choice (sequence
               `"`
               (capture (any (choice (sequence (not (set "%&")) (to `"`))
                                     :reference)))
               `"`)
             (sequence
               "'"
               (capture (any (choice (sequence (not (set "%&")) (to "'"))
                                     :reference)))
               "'"))
     # 2.4 Character Data and Markup
     :char-data (any (to (set "<&"))) # XXX: `any` is correct?
     # 2.5 Comments
     #
     # the body is made of characters other than "-" and of single "-"
     # characters that are followed by something other than "-", so
     # lone hyphens inside a comment are accepted and "--" can only be
     # the start of the closing "-->"
     :comment (sequence "<!--"
                        (any (choice (if-not "-" 1)
                                     (sequence "-" (if-not "-" 1))))
                        "-->") # SSS
     # 2.6 Processing Instructions
     :pi (sequence "<?" (to "?>") "?>") # SSS
     # 2.7 CDATA Sections
     :cdsect (sequence "<![CDATA[" (to "]]>") "]]>")
     # 2.8 Prolog and Document Type Declaration
     :prolog (sequence (opt :xml-decl)
                       (any :misc)
                       (opt (sequence :doctype-decl (any :misc))))
     :xml-decl (sequence "<?xml" (to "?>") "?>") # SSS
     :eq (sequence :s* "=" :s*)
     :misc (choice :comment :pi :s+)
     #
     :doctype-decl (sequence "<!DOCTYPE" (to ">") ">") # SSS
     # 3.1 Start-Tags, End-Tags, and Empty-Element Tags
     #
     # the group bundles the name and attribute captures into a single
     # array capture
     :stag (group (sequence "<"
                            (capture :name)
                            (any (sequence :s+ :attribute))
                            :s*
                            ">"))
     #
     :attribute (sequence (capture :name) :eq :att-value)
     #
     :etag (sequence "</"
                     (capture :name)
                     :s*
                     ">")
     #
     # character data (possibly empty) is captured before and after
     # every child; processing instructions and comments are matched
     # but not captured
     :content (sequence (capture (opt :char-data))
                        (any (sequence (choice :element
                                               (capture :reference)
                                               (capture :cdsect)
                                               :pi
                                               :comment)
                                       (capture (opt :char-data)))))
     # 4.1 Character and Entity References
     :char-ref (choice (sequence "&#" :d+ ";")
                       (sequence "&#x" :h+ ";"))
     :reference (choice :entity-ref :char-ref)
     :entity-ref (sequence "&" :name ";")})
(comment
# usages of xmlish-peg; each form is followed by `# =>` and the value
# it is expected to produce
#
# start tag directly followed by its end tag
(peg/match xmlish-peg "<fun></fun>")
# =>
@[{:tag "fun"}]
# text content
(peg/match xmlish-peg "<fun>more</fun>")
# =>
@[{:content @["more"] :tag "fun"}]
# one attribute on the start tag
(peg/match xmlish-peg `<fun flavor="spicy">more</fun>`)
# =>
@[{:attrs @{"flavor" "spicy"}
:content @["more"]
:tag "fun"}]
# individual rules are exercised by merging in a different :main
(peg/match (merge xmlish-peg {:main :attribute})
`flavor="spicy"`)
# =>
@["flavor" "spicy"]
(peg/match (merge xmlish-peg {:main :att-value})
`"spicy"`)
# =>
@["spicy"]
# several attributes on the start tag
(peg/match xmlish-peg
`<fun flavor="spicy" place="home">more</fun>`)
# =>
@[{:attrs @{"flavor" "spicy" "place" "home"}
:content @["more"]
:tag "fun"}]
# empty-element tags, with and without attributes and with either
# kind of quote around attribute values
(peg/match xmlish-peg "<empty />")
# =>
@[{:tag "empty"}]
(peg/match xmlish-peg "<empty what='mind'/>")
# =>
@[{:attrs @{"what" "mind"}
:tag "empty"}]
(peg/match xmlish-peg `<empty what='mind' when="now" />`)
# =>
@[{:attrs @{"what" "mind"
"when" "now"}
:tag "empty"}]
# whitespace-only content is kept
(peg/match xmlish-peg "<rss>\n</rss>")
# =>
@[{:content @["\n"] :tag "rss"}]
# entity references in content are kept verbatim (not decoded) and
# merged with the surrounding text into one string
(def desc-with-content
(string
`<description>`
`&lt;p&gt;`
`&lt;a href="https://lobste.rs/"&gt;`
`Comments&lt;/a&gt;`
`&lt;/p&gt;`
`</description>`))
(peg/match (merge xmlish-peg {:main :non-empty-elem})
desc-with-content)
# =>
@[{:content
@[(string "&lt;" "p" "&gt;"
"&lt;"
"a href=\"https://lobste.rs/\""
"&gt;" "Comments" "&lt;" "/a" "&gt;"
"&lt;" "/p" "&gt;")]
:tag "description"}]
)
(comment
# feed-1: XML declaration, then a nested element whose content is a
# small CDATA section; the CDATA markers are kept in the content
(def feed-1
(string
`<?xml version="1.0" encoding="UTF-8"?>` "\n"
`<rss version="2.0">`
`<description>`
`<![CDATA[a]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-1)
# =>
@[{:attrs @{"version" "2.0"}
:content @[{:content @["<![CDATA[a]]>"]
:tag "description"}]
:tag "rss"}]
# feed-2: like feed-1, but the CDATA section holds markup, which must
# not be parsed as elements
(def feed-2
(string
`<?xml version="1.0" encoding="UTF-8"?>` "\n"
`<rss version="2.0">`
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-2)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[(string "<![CDATA["
"<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>"
"]]>")]
:tag "description"}]
:tag "rss"}]
# feed-3: feed-2 without the XML declaration
(def feed-3
(string
`<rss version="2.0">`
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-3)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[(string "<![CDATA["
"<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>"
"]]>")]
:tag "description"}]
:tag "rss"}]
# feed-4: one more nesting level (rss > channel > description)
(def feed-4
(string
`<rss version="2.0">`
`<channel>`
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</channel>`
`</rss>`))
(peg/match xmlish-peg feed-4)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@[(string "<![CDATA["
"<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>"
"]]>")]
:tag "description"}]
:tag "channel"}]
:tag "rss"}]
# feed-5: feed-4 plus a sibling element (item) that has no content
(def feed-5
(string
`<rss version="2.0">`
`<channel>`
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`<item>`
`</item>`
`</channel>`
`</rss>`))
(peg/match xmlish-peg feed-5)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@[(string "<![CDATA["
"<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>"
"]]>")]
:tag "description"}
{:tag "item"}]
:tag "channel"}]
:tag "rss"}]
# feed-6: several sibling elements with plain text content
(def feed-6
(string
`<rss version="2.0">`
`<channel>`
`<title>Hacker News</title>`
`<link>https://news.ycombinator.com/</link>`
`<description>`
`Links for the intellectually curious, ranked by readers.`
`</description>`
`<item></item>`
`</channel>`
`</rss>`))
(peg/match xmlish-peg feed-6)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content @["Hacker News"] :tag "title"}
{:content @["https://news.ycombinator.com/"] :tag "link"}
{:content @[(string "Links for the intellectually curious, "
"ranked by readers.")]
:tag "description"}
{:tag "item"}]
:tag "channel"}]
:tag "rss"}]
# feed-7: the same input text as feed-2; only the spelling of the
# expected string differs
(def feed-7
(string
`<?xml version="1.0" encoding="UTF-8"?>` "\n"
`<rss version="2.0">`
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-7)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[(string "<![CDATA[<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>]]>")]
:tag "description"}]
:tag "rss"}]
# feed-8: a newline after the root start tag shows up as a "\n"
# string in the root element's content
(def feed-8
(string
`<?xml version="1.0" encoding="UTF-8"?>` "\n"
`<rss version="2.0">` "\n"
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-8)
# =>
@[{:attrs @{"version" "2.0"}
:content
@["\n"
{:content
@[(string "<![CDATA[<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>]]>")]
:tag "description"}]
:tag "rss"}]
# feed-9: feed-8 without the XML declaration
(def feed-9
(string
`<rss version="2.0">` "\n"
`<description>`
`<![CDATA[`
`<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>`
`]]>`
`</description>`
`</rss>`))
(peg/match xmlish-peg feed-9)
# =>
@[{:attrs @{"version" "2.0"}
:content
@["\n"
{:content
@[(string "<![CDATA[<a href=\"https://news.ycombinator.com/"
"item?id=46895972\">Comments</a>]]>")]
:tag "description"}]
:tag "rss"}]
# feed-10: XML declaration plus a channel holding text elements and a
# fully populated item, with no whitespace between the tags
(def feed-10
(string
`<?xml version="1.0" encoding="UTF-8"?>` "\n"
`<rss version="2.0">`
`<channel>`
`<title>Hacker News</title>`
`<link>https://news.ycombinator.com/</link>`
`<description>`
`Links for the intellectually curious, ranked by readers.`
`</description>`
`<item>`
`<title>When internal hostnames are leaked to the clown</title>`
`<link>https://rachelbythebay.com/w/2026/02/03/badnas/</link>`
`<pubDate>Thu, 05 Feb 2026 05:22:36 +0000</pubDate>`
`<comments>https://news.ycombinator.com/item?id=46895972</comments>`
`<description>`
`<![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>]]>`
`</description>`
`</item>`
`</channel>`
`</rss>`))
(peg/match xmlish-peg feed-10)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@["Hacker News"]
:tag "title"}
{:content @["https://news.ycombinator.com/"]
:tag "link"}
{:content @[(string "Links for the intellectually curious, "
"ranked by readers.")]
:tag "description"}
{:content
@[{:content @["When internal hostnames are leaked to the clown"]
:tag "title"}
{:content @["https://rachelbythebay.com/w/2026/02/03/badnas/"]
:tag "link"}
{:content @["Thu, 05 Feb 2026 05:22:36 +0000"]
:tag "pubDate"}
{:content @["https://news.ycombinator.com/item?id=46895972"]
:tag "comments"}
{:content
@[(string "<![CDATA[<a href=\"https://"
"news.ycombinator.com/item?id=46895972\">"
"Comments</a>]]>")]
:tag "description"}]
:tag "item"}]
:tag "channel"}]
:tag "rss"}]
# feed-11: feed-10 without the XML declaration
(def feed-11
(string
`<rss version="2.0">`
`<channel>`
`<title>Hacker News</title>`
`<link>https://news.ycombinator.com/</link>`
`<description>`
`Links for the intellectually curious, ranked by readers.`
`</description>`
`<item>`
`<title>When internal hostnames are leaked to the clown</title>`
`<link>https://rachelbythebay.com/w/2026/02/03/badnas/</link>`
`<pubDate>Thu, 05 Feb 2026 05:22:36 +0000</pubDate>`
`<comments>https://news.ycombinator.com/item?id=46895972</comments>`
`<description>`
`<![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">`
`Comments</a>]]>`
`</description>`
`</item>`
`</channel>`
`</rss>`))
(peg/match xmlish-peg feed-11)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content @["Hacker News"] :tag "title"}
{:content @["https://news.ycombinator.com/"] :tag "link"}
{:content @[(string "Links for the intellectually curious, "
"ranked by readers.")]
:tag "description"}
{:content
@[{:content @["When internal hostnames are leaked to the clown"]
:tag "title"}
{:content @["https://rachelbythebay.com/w/2026/02/03/badnas/"]
:tag "link"}
{:content @["Thu, 05 Feb 2026 05:22:36 +0000"]
:tag "pubDate"}
{:content @["https://news.ycombinator.com/item?id=46895972"]
:tag "comments"}
{:content
@[(string "<![CDATA[<a href=\"https://"
"news.ycombinator.com/item?id=46895972\">"
"Comments</a>]]>")]
:tag "description"}]
:tag "item"}]
:tag "channel"}]
:tag "rss"}]
# feed: a multi-line document with a namespaced attribute, an
# empty-element tag (atom:link), an element with no content
# (description) and entity references in an item's description; the
# whitespace between tags is expected to show up as string items in
# each element's content
#
# NOTE(review): the expected whitespace strings below are "\n " (and a
# bare "\n" just before the root end tag), while the lines of the feed
# literal as it appears here carry no leading spaces -- presumably
# indentation was lost somewhere; confirm against the original
# formatting before relying on this expectation
(def feed
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Lobsters</title>
<link>https://lobste.rs/</link>
<atom:link href="https://lobste.rs/rss" rel="self"/>
<description></description>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<ttl>120</ttl>
<item>
<title>Recreating PDFs</title>
<link>https://neosmart.net/blog/</link>
<guid>https://lobste.rs/s/iyu0f8</guid>
<author>neosmart.net by mqudsi</author>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<comments>https://lobste.rs/s/iyu0f8/recreating</comments>
<description>&lt;p&gt;&lt;a href="https://lobste.rs/s/2svv99/competence_as_tragedy"&gt;Comments&lt;/a&gt;&lt;/p&gt;</description>
<category>security</category>
<category>reversing</category>
</item>
</channel>
</rss>
``)
(peg/match xmlish-peg feed)
# =>
@[{:attrs @{"version" "2.0"
"xmlns:atom" "http://www.w3.org/2005/Atom"}
:content
@["\n "
{:content
@["\n "
{:content @["Lobsters"] :tag "title"}
"\n "
{:content @["https://lobste.rs/"] :tag "link"}
"\n "
{:attrs @{"href" "https://lobste.rs/rss"
"rel" "self"}
:tag "atom:link"}
"\n "
{:tag "description"}
"\n "
{:content @["Wed, 04 Feb 2026 13:20:32 -0600"]
:tag "pubDate"}
"\n "
{:content @["120"] :tag "ttl"}
"\n "
{:content
@["\n "
{:content @["Recreating PDFs"] :tag "title"}
"\n "
{:content @["https://neosmart.net/blog/"] :tag "link"}
"\n "
{:content @["https://lobste.rs/s/iyu0f8"] :tag "guid"}
"\n "
{:content @["neosmart.net by mqudsi"] :tag "author"}
"\n "
{:content @["Wed, 04 Feb 2026 13:20:32 -0600"]
:tag "pubDate"}
"\n "
{:content @["https://lobste.rs/s/iyu0f8/recreating"]
:tag "comments"}
"\n "
{:content
@[(string "&lt;p&gt;"
"&lt;"
"a href=\"https://lobste.rs/s/2svv99/"
"competence_as_tragedy\""
"&gt;"
"Comments"
"&lt;/a&gt;"
"&lt;/p&gt;")]
:tag "description"}
"\n "
{:content @["security"] :tag "category"}
"\n "
{:content @["reversing"] :tag "category"}
"\n "]
:tag "item"}
"\n "]
:tag "channel"}
"\n"]
:tag "rss"}]
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment