Skip to content

Instantly share code, notes, and snippets.

@sogaiu
Last active February 5, 2026 09:57
Show Gist options
  • Select an option

  • Save sogaiu/ff8abc7730c238ec3cbf82699d91c638 to your computer and use it in GitHub Desktop.

Select an option

Save sogaiu/ff8abc7730c238ec3cbf82699d91c638 to your computer and use it in GitHub Desktop.
#! /usr/bin/env janet
# based on:
#
# https://codeberg.org/veqq/rss-reader/src/branch/master/rss-reader.janet
# usage: pass feed urls on command line
# requirements:
#
# * bash (or similar?)
# * md5sum
# * awk
# * w3m
# * grep
# * curl
########################################################################
(import spork/sh-dsl :as sh)
########################################################################
# Settings read by `main`.  The two file names are joined onto `:dir`.
(def config
  (struct
    # directory to store files in
    :dir (string (os/getenv "HOME") "/Desktop/rss")
    # file holding one hash per item already encountered
    :downloaded "downloaded.txt"
    # file holding filter rules (see usages for load-filters for examples)
    :filters "filters.txt"
    # upper bound on how much html is handed to the html -> text step
    :max-chars 1000
    # XXX: unused
    :max-items 150))
########################################################################
(defn load-hashes
  ``
  Build a lookup table of item hashes from the text of the hashes file.

  `content` holds one hash per line, or is nil/false when there is no
  file content, in which case an empty table is returned.

  Returns a table mapping each hash string to true.  Every line is
  stripped of surrounding whitespace (so a trailing carriage return is
  tolerated) and lines that are blank after stripping are skipped, so
  the empty string never ends up as a key.
  ``
  [content]
  (when (not content)
    (break @{}))
  #
  (tabseq [line :in (string/split "\n" content)
           # trim before testing for emptiness so that whitespace-only
           # lines do not produce a "" entry
           :let [hsh (string/trim line)]
           :when (not (empty? hsh))]
    hsh true))
(comment
(def hash-content
``
b1946ac92492d2347c6235b4d2611184
c4ff45bb1fab99f9164b7fec14b2292a
``)
(load-hashes hash-content)
# =>
@{"b1946ac92492d2347c6235b4d2611184" true
"c4ff45bb1fab99f9164b7fec14b2292a" true}
)
(defn load-filters
  ``
  Parse the text of the filters file into an array of rule structs.

  Each line has the form url,type,regex.  Everything after the second
  comma belongs to the regex, so the regex itself may contain commas.
  Lines with fewer than three comma-separated fields are ignored.

  Returns an array of structs with keys :url, :type and :regex, or an
  empty array when `content` is nil/false.
  ``
  [content]
  (if-not content
    @[]
    (let [rules @[]]
      (each line (string/split "\n" (string/trim content))
        (def fields (string/split "," line))
        (when (<= 3 (length fields))
          (def [url kind] fields)
          # re-join the tail so commas inside the regex survive
          (def regex (string/join (array/slice fields 2) ","))
          (array/push rules {:url url
                             :type kind
                             :regex regex})))
      rules)))
(comment
(def filters
``
*,content,language|poetry|programming
https://news.ycombinator.com/rss,title,TypeScript
``)
(load-filters filters)
# =>
@[{:regex "language|poetry|programming"
:type "content"
:url "*"}
{:regex "TypeScript"
:type "title"
:url "https://news.ycombinator.com/rss"}]
)
(defn md5
  ``
  Return the md5 digest of `s` as a string of hex digits.

  The value is piped through the external `md5sum` program via `echo -n`
  (so no newline is appended to `s`), and `awk` keeps only the first
  field of the md5sum output.
  ``
  [s]
  (def digest
    (sh/$<_ echo -n ,s | md5sum | awk {print $1}))
  # callers use the digest as a table key, so hand back a plain string
  (string digest))
(comment
(md5 "hello")
# =>
"5d41402abc4b2a76b9719d911017c592"
(md5 "hello\n")
# =>
"b1946ac92492d2347c6235b4d2611184"
(md5 "there")
# =>
"d850f04cdb48312a9be171e214c0b4ee"
(md5 "there\n")
# =>
"c4ff45bb1fab99f9164b7fec14b2292a"
)
########################################################################
# modified peg from janet-xmlish
# PEG grammar for a small, approximate subset of XML, used to read RSS.
#
# Elements are captured as structs: :tag holds the element name, :attrs
# holds a table of attribute name -> value, and :content holds the
# sequence of child elements / text.  :attrs and :content are nil when
# there is nothing to put in them.
(def xmlish-peg
# document: optional xml declaration, optional doctype, comments, one
# root element, trailing comments; captures made while matching the
# declaration and the doctype are dropped
~{:main (sequence (opt (drop :xml-declaration)) :s*
(opt (drop :doctype)) :s*
(any :comment) :s*
:element :s*
(any :comment) :s*)
#
# <?xml name="value" ... ?>
:xml-declaration (sequence :s* "<?xml" :s*
(any :attribute) :s*
"?>")
# XXX: only handles very simple case
# NOTE: the literal below is lowercase, so only "<!doctype" spelled
# exactly that way is recognised
:doctype (sequence :s* "<!doctype" :s*
:tag-name :s*
">")
# XXX: not accurate
# captures the attribute name and then the value; the value may be
# wrapped in double or in single quotes (quotes are not captured)
:attribute (sequence
(capture (to (set " /<=>\""))) :s*
"=" :s*
(choice (sequence `"` (capture (to `"`)) `"`)
(sequence "'" (capture (to "'")) "'"))
:s*)
# section 2.5 of xml spec
# the body is any run of characters in which "-" is never followed by
# another "-"; nothing is captured
:comment (sequence
"<!--"
(any (choice
(if-not (set "-") 1)
(sequence "-" (if-not (set "-") 1))))
"-->")
#
:element (choice :empty-element :non-empty-element)
#
# <name attr="value" ... />
# the function receives the tag name followed by alternating attribute
# names and values
:empty-element (cmt (sequence
"<" (capture :tag-name) :s*
(any :attribute)
"/>")
,|(let [args $&
elt-name (first args)
attrs (drop 1 args)
attrs (if (= (length attrs) 0)
nil
(table ;attrs))]
{:attrs attrs
:tag elt-name}))
#
# <name ...> children </name>
# children are tried in this order: whitespace (skipped), a CDATA
# section (captured verbatim, wrapper included), a comment (skipped), a
# nested element, plain text up to the next "<"
:non-empty-element
(cmt (sequence :open-tag
(any (choice :s+
(capture :pcdata)
:comment
:element
(capture :content)
))
:close-tag)
# args: first is the open-tag group (name followed by attribute
# names / values), last is the close-tag name, everything in between
# is a child
,|(let [args $&
open-name (first (first args))
attrs (drop 1 (first args))
close-name (last args)]
# when the names differ `when` yields nil, which makes the cmt (and
# so this alternative) fail
(when (= open-name close-name)
(let [elt-name open-name
content (filter (fn [c-item]
(if (string? c-item)
# drop whitespace-only strings
(not= "" (string/trim c-item))
(not= "" c-item)))
(tuple/slice args 1 -2))
content (if (= (length content) 0)
nil
content)
attrs (if (= (length attrs) 0)
nil
(table ;attrs))]
{:attrs attrs
:content content
:tag elt-name}))))
# XXX: could use work?
# must not begin with "!" or "?"; runs up to (not including) the next
# space, "/", "<" or ">"
:tag-name (sequence (not "!") (not "?") (to (set " /<>")))
#
# captures are grouped so that the name and attributes of the open tag
# arrive as a single argument in :non-empty-element
:open-tag (group
(sequence
"<" (capture :tag-name) :s*
(any :attribute)
">"))
#
:close-tag (sequence
"</" (capture :tag-name) :s* ">")
# XXX: may not be accurate
# matches a whole CDATA section, from "<![CDATA[" through "]]>"
:pcdata (sequence "<![CDATA[" (to "]]>") "]]>")
# XXX: may not be accurate
# text up to (not including) the next "<"
:content (to "<")})
(comment
(def feed
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Lobsters</title>
<link>https://lobste.rs/</link>
<atom:link href="https://lobste.rs/rss" rel="self"/>
<description></description>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<ttl>120</ttl>
<item>
<title>Recreating PDFs</title>
<link>https://neosmart.net/blog/</link>
<guid>https://lobste.rs/s/iyu0f8</guid>
<author>neosmart.net by mqudsi</author>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<comments>https://lobste.rs/s/iyu0f8/recreating</comments>
<description>&lt;p&gt;&lt;a href="https://lobste.rs/s/2svv99/competence_as_tragedy"&gt;Comments&lt;/a&gt;&lt;/p&gt;</description>
<category>security</category>
<category>reversing</category>
</item>
</channel>
</rss>
``)
(peg/match xmlish-peg feed)
# =>
@[{:attrs @{"version" "2.0"
"xmlns:atom" "http://www.w3.org/2005/Atom"}
:content
@[{:content
@[{:content @["Lobsters"]
:tag "title"}
{:content @["https://lobste.rs/"]
:tag "link"}
{:attrs @{"href" "https://lobste.rs/rss"
"rel" "self"}
:tag "atom:link"}
{:tag "description"}
{:content @["Wed, 04 Feb 2026 13:20:32 -0600"]
:tag "pubDate"}
{:content @["120"]
:tag "ttl"}
{:content
@[{:content @["Recreating PDFs"]
:tag "title"}
{:content @["https://neosmart.net/blog/"]
:tag "link"}
{:content @["https://lobste.rs/s/iyu0f8"]
:tag "guid"}
{:content @["neosmart.net by mqudsi"]
:tag "author"}
{:content @["Wed, 04 Feb 2026 13:20:32 -0600"]
:tag "pubDate"}
{:content @["https://lobste.rs/s/iyu0f8/recreating"]
:tag "comments"}
{:content
@[(string `&lt;p&gt;&lt;a href="https://lobste.rs/s/`
`2svv99/competence_as_tragedy"&gt;Comments&lt;`
`/a&gt;&lt;/p&gt;`)]
:tag "description"}
{:content @["security"]
:tag "category"}
{:content @["reversing"]
:tag "category"}]
:tag "item"}]
:tag "channel"}]
:tag "rss"}]
(peg/match xmlish-peg "<rss>\n</rss>")
# =>
@[{:tag "rss"}]
(def feed-1
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><description><![CDATA[a]]></description></rss>
``)
(peg/match xmlish-peg feed-1)
# =>
@[{:attrs @{"version" "2.0"}
:content @[{:content @["<![CDATA[a]]>"] :tag "description"}]
:tag "rss"}]
(def feed-2
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></rss>
``)
(peg/match xmlish-peg feed-2)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "rss"}]
(def feed-3
``
<rss version="2.0"><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></rss>
``)
(peg/match xmlish-peg feed-3)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content @["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "rss"}]
(def feed-4
``
<rss version="2.0"><channel><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></channel></rss>
``)
(peg/match xmlish-peg feed-4)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "channel"}]
:tag "rss"}]
(def feed-5
``
<rss version="2.0"><channel><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description><item></item></channel></rss>
``)
(peg/match xmlish-peg feed-5)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}
{:tag "item"}]
:tag "channel"}]
:tag "rss"}]
(def feed-6
``
<rss version="2.0"><channel><title>Hacker News</title><link>https://news.ycombinator.com/</link><description>Links for the intellectually curious, ranked by readers.</description><item></item></channel></rss>
``)
(peg/match xmlish-peg feed-6)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content @["Hacker News"] :tag "title"}
{:content @["https://news.ycombinator.com/"] :tag "link"}
{:content @["Links for the intellectually curious, ranked by readers."]
:tag "description"}
{:tag "item"}]
:tag "channel"}]
:tag "rss"}]
(def feed-7
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></rss>
``)
(peg/match xmlish-peg feed-7)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "rss"}]
(def feed-8
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></rss>
``)
(peg/match xmlish-peg feed-8)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "rss"}]
(def feed-9
``
<rss version="2.0">
<description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></rss>
``)
(peg/match xmlish-peg feed-9)
# =>
'@[{:attrs @{"version" "2.0"}
:content
@[{:content
@["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "rss"}]
(def feed-10
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0"><channel><title>Hacker News</title><link>https://news.ycombinator.com/</link><description>Links for the intellectually curious, ranked by readers.</description><item><title>When internal hostnames are leaked to the clown</title><link>https://rachelbythebay.com/w/2026/02/03/badnas/</link><pubDate>Thu, 05 Feb 2026 05:22:36 +0000</pubDate><comments>https://news.ycombinator.com/item?id=46895972</comments><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></item></channel></rss>
``)
(peg/match xmlish-peg feed-10)
# =>
@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content
@["Hacker News"]
:tag "title"}
{:content @["https://news.ycombinator.com/"]
:tag "link"}
{:content @["Links for the intellectually curious, ranked by readers."]
:tag "description"}
{:content
@[{:content @["When internal hostnames are leaked to the clown"]
:tag "title"}
{:content @["https://rachelbythebay.com/w/2026/02/03/badnas/"]
:tag "link"}
{:content @["Thu, 05 Feb 2026 05:22:36 +0000"]
:tag "pubDate"}
{:content @["https://news.ycombinator.com/item?id=46895972"]
:tag "comments"}
{:content @["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "item"}]
:tag "channel"}]
:tag "rss"}]
(def feed-11
``
<rss version="2.0"><channel><title>Hacker News</title><link>https://news.ycombinator.com/</link><description>Links for the intellectually curious, ranked by readers.</description><item><title>When internal hostnames are leaked to the clown</title><link>https://rachelbythebay.com/w/2026/02/03/badnas/</link><pubDate>Thu, 05 Feb 2026 05:22:36 +0000</pubDate><comments>https://news.ycombinator.com/item?id=46895972</comments><description><![CDATA[<a href="https://news.ycombinator.com/item?id=46895972">Comments</a>]]></description></item></channel></rss>
``)
(peg/match xmlish-peg feed-11)
# =>
'@[{:attrs @{"version" "2.0"}
:content
@[{:content
@[{:content @["Hacker News"] :tag "title"}
{:content @["https://news.ycombinator.com/"] :tag "link"}
{:content @["Links for the intellectually curious, ranked by readers."]
:tag "description"}
{:content
@[{:content @["When internal hostnames are leaked to the clown"]
:tag "title"}
{:content @["https://rachelbythebay.com/w/2026/02/03/badnas/"]
:tag "link"}
{:content @["Thu, 05 Feb 2026 05:22:36 +0000"]
:tag "pubDate"}
{:content @["https://news.ycombinator.com/item?id=46895972"]
:tag "comments"}
{:content @["<![CDATA[<a href=\"https://news.ycombinator.com/item?id=46895972\">Comments</a>]]>"]
:tag "description"}]
:tag "item"}]
:tag "channel"}]
:tag "rss"}]
)
########################################################################
(defn html->text
  ``
  Render `html` as plain text by piping it through `w3m -dump`.

  Only the first `max-chars` bytes of `html` are used.  When `max-chars`
  is not given it is taken from the dynamic binding :max-chars, falling
  back to 1000.  An empty input yields "" without running any external
  program.
  ``
  [html &opt max-chars]
  (def limit
    (if (nil? max-chars)
      (dyn :max-chars 1000)
      max-chars))
  (def end (min (length html) limit))
  (def snippet (string/slice html 0 end))
  #
  (if (empty? snippet)
    ""
    (sh/$<_ echo ,snippet | w3m -T text/html -dump)))
(comment
(def html
``
<html>
<body>
<h1>greetings</h1>
<p>health, happiness, and peace</p>
</body>
</html>
``)
(html->text html)
# =>
@"greetings\n\nhealth, happiness, and peace\n"
)
(defn extract-href
  ``
  Return the value of the first double-quoted href attribute in `xml`.

  Returns "" when there is no href followed by an opening double quote,
  or when no closing double quote follows it.
  ``
  [xml]
  (def marker `href="`)
  (def start (string/find marker xml))
  (if (nil? start)
    ""
    (let [from (+ start (length marker))
          stop (string/find `"` xml from)]
      (if (nil? stop)
        ""
        (string/slice xml from stop)))))
(comment
(def item-str
``
<item>
<title>I miss thinking hard</title>
<link>https://www.jernesto.com/articles/thinking_hard</link>
<guid>https://lobste.rs/s/hfuiti</guid>
<author>jernesto.com via xyproto</author>
<pubDate>Wed, 04 Feb 2026 02:35:56 -0600</pubDate>
<comments>https://lobste.rs/s/hfuiti/i_miss_thinking_hard</comments>
<description>&lt;p&gt;&lt;a href="https://lobste.rs/s/hfuiti/i_miss_thinking_hard"&gt;Comments&lt;/a&gt;&lt;/p&gt;</description>
<category>philosophy</category>
<category>vibecoding</category>
</item>
``)
(extract-href item-str)
# =>
"https://lobste.rs/s/hfuiti/i_miss_thinking_hard"
(def desc-content-str
(string `&lt;p&gt;&lt;a href="https://lobste.rs/s/hfuiti/`
`i_miss_thinking_hard"&gt;Comments&lt;/a&gt;&lt;/p&gt;`))
(extract-href desc-content-str)
# =>
"https://lobste.rs/s/hfuiti/i_miss_thinking_hard"
)
########################################################################
(defn rss->items
  ``
  Parse the RSS document `rss-str` and return an array with one struct
  per non-empty <item>.

  Each struct carries :title, :link, :desc-link (first href found in the
  description), :description (description rendered to text) and :content
  (the "content" child rendered to text, nil when absent).

  Returns an empty array when the rss element has no children, when the
  channel has no children, or when the channel holds no items.

  Raises when the document does not parse, when the root is not an rss
  element, when the first child of rss is not a channel element, or when
  an item lacks a title, link or description.
  ``
  [rss-str]
  (def parsed (peg/match xmlish-peg rss-str))
  (assert parsed "failed to extract items")
  #
  (def root (first parsed))
  (def root-tag (get root :tag))
  (assertf (= "rss" root-tag)
           "unexpected tag: %n in rss: %n" root-tag root)
  #
  (def root-kids (get root :content))
  (unless root-kids
    (break @[]))
  #
  # the channel is expected to be the first child of rss
  (def channel (first root-kids))
  (unless channel
    (break @[]))
  #
  (def channel-tag (get channel :tag))
  (assertf (= "channel" channel-tag)
           "unexpected tag: %n in channel: %n" channel-tag channel)
  #
  (def channel-kids (get channel :content))
  (unless channel-kids
    (break @[]))
  #
  (def entries
    (filter (fn [node] (= "item" (get node :tag)))
            channel-kids))
  (when (empty? entries)
    (break @[]))
  #
  (def results @[])
  (each entry entries
    (def kids (get entry :content))
    (when (and kids (not (empty? kids)))
      # tag name -> first piece of content of that child
      # XXX: highlander assumption
      (def fields
        (tabseq [kid :in kids
                 :when (dictionary? kid)]
          (get kid :tag) (first (get kid :content))))
      #
      (def title (get fields "title"))
      (assertf title "failed to find title in: %n" fields)
      (def link (get fields "link"))
      (assertf link "failed to find link in: %n" fields)
      (def desc (get fields "description"))
      (assertf desc "failed to find description in: %n" fields)
      #
      (def content
        (let [raw (get fields "content")]
          (if raw (html->text raw))))
      # XXX: should content be examined for oddities?
      (def desc-link (extract-href desc))
      (def description (html->text desc))
      (array/push results
                  {:title title
                   :link link
                   :desc-link desc-link
                   :description description
                   :content content})))
  #
  results)
(comment
(def feed
``
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Lobsters</title>
<link>https://lobste.rs/</link>
<atom:link href="https://lobste.rs/rss" rel="self"/>
<description></description>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<ttl>120</ttl>
<item>
<title>Recreating PDFs</title>
<link>https://neosmart.net/blog/</link>
<guid>https://lobste.rs/s/iyu0f8</guid>
<author>neosmart.net by mqudsi</author>
<pubDate>Wed, 04 Feb 2026 13:20:32 -0600</pubDate>
<comments>https://lobste.rs/s/iyu0f8/recreating</comments>
<description>&lt;p&gt;&lt;a href="https://lobste.rs/s/2svv99/competence_as_tragedy"&gt;Comments&lt;/a&gt;&lt;/p&gt;</description>
<category>security</category>
<category>reversing</category>
</item>
</channel>
</rss>
``)
(rss->items feed)
# =>
@[{:desc-link "https://lobste.rs/s/2svv99/competence_as_tragedy"
:description
(buffer "<p><a href=\"https://lobste.rs/s/2svv99/"
"competence_as_tragedy\">Comments</a></p>")
:link "https://neosmart.net/blog/"
:title "Recreating PDFs"}]
)
########################################################################
(defn passes?
  ``
  Tell whether `content` is acceptable under `filters`.

  A filter applies when its :type equals `ftype` and its :url is either
  "*" or equal to `url`.  An applicable filter rejects `content` when
  `grep -E` does not find the filter's :regex in it.  Checking stops at
  the first rejecting filter.

  Returns true when `content` is nil/false or empty, or when no
  applicable filter rejects it; false otherwise.
  ``
  [filters url ftype content]
  (if (or (not content) (empty? content))
    true
    (let [rejects?
          (fn [{:type rule-type :url rule-url :regex rule-regex}]
            (and (= rule-type ftype)
                 (or (= rule-url "*") (= rule-url url))
                 (not (sh/$? echo ,content |
                             grep -Eq ,rule-regex))))]
      # `some` stops at the first filter for which rejects? is truthy
      (not (some rejects? filters)))))
(comment
(def filters
@[{:regex "language|poetry|programming"
:type "content"
:url "*"}
{:regex "TypeScript"
:type "title"
:url "https://news.ycombinator.com/rss"}])
(def item-url "https://news.ycombinator.com/rss")
(passes? filters "https://news.ycombinator.com/rss" "url" item-url)
# =>
true
(passes? filters "https://news.ycombinator.com/rss" "content" "yuck")
# =>
false
)
(defn get-feed
  ``
  Download `url` with curl and return the response body.

  Redirects are followed, so a feed that has moved (for example from
  http to https) still yields the feed rather than the redirect page.

  Raises an error naming the url when running curl fails.
  ``
  [url]
  (try
    # -s: no progress meter, -L: follow redirects
    (sh/$< curl -s -L ,url)
    ([err]
      # %V renders any value, so a non-string error value cannot make
      # the formatting itself fail and hide the original problem
      (errorf "curl failed for %s with: %V" url err))))
(comment
(get-feed "https://lobste.rs/rss")
)
(defn process-feed
  ``
  Fetch the feed at `url`, print each item that has not been seen
  before and that passes the filters, and return the hashes of the
  items that were printed.

  `dl-content` is the text of the hashes file and `filter-content` the
  text of the filters file; when omitted they are read from the dynamic
  bindings :downloaded and :filters.

  An item is identified by the md5 of its link, description link and
  title concatenated.  An item is printed only when both links, the
  title, the description and the content all pass the filters.

  Returns an array of new hashes, or nil when the feed body is empty.
  ``
  [url &opt dl-content filter-content]
  (default dl-content (dyn :downloaded))
  (default filter-content (dyn :filters))
  #
  (def hashes (load-hashes dl-content))
  (def filters (load-filters filter-content))
  #
  (def new @[])
  #
  (def rss-str (get-feed url))
  (when (empty? rss-str)
    (printf "Empty content from: %s" url)
    (break nil))
  #
  (print "From " url)
  (print)
  #
  (def items (rss->items rss-str))
  #
  (each {:title title
         :link link1
         :desc-link link2
         :description description
         :content content} items
    (def hsh (md5 (string link1 link2 title)))
    (when (not (get hashes hsh))
      (when (and (passes? filters url "url" link1)
                 (passes? filters url "url" link2)
                 (passes? filters url "title" title)
                 (passes? filters url "content" description)
                 (passes? filters url "content" content))
        (print "--- " url)
        (print hsh)
        (when (not (empty? link1)) (print link1))
        (when (not (empty? link2)) (print link2))
        (when (not (empty? title)) (print title))
        (when (not (empty? description)) (print description))
        # description and content may be buffers, and `=` on buffers
        # is an identity test; deep= compares the bytes, so content
        # that merely repeats the description is not printed again
        (when (not (or (not content)
                       (empty? content)
                       (deep= description content)))
          (print content))
        (print)
        #
        (array/push new hsh)
        # remember the hash so a repeat within this feed is skipped
        (put hashes hsh true))))
  #
  new)
########################################################################
(defn main
  ``
  Entry point: treat every command line argument as a feed url, process
  each feed in turn, and append the hashes of newly printed items to the
  hashes file.

  The storage directory and file names come from `config`.  The contents
  of the hashes and filters files (nil when a file is missing) are
  published through the dynamic bindings :downloaded and :filters, and
  the text limit through :max-chars.
  ``
  [_ & args]
  # contents of path when it is a regular file, otherwise nil
  (defn read-if-file
    [path]
    (when (= :file (os/stat path :mode))
      (slurp path)))
  #
  (def base-dir (get config :dir))
  #
  (setdyn :max-chars (get config :max-chars))
  #
  (os/mkdir base-dir)
  #
  (def dl-path (string base-dir "/" (get config :downloaded)))
  (def dl-content (read-if-file dl-path))
  #
  (def filters-path (string base-dir "/" (get config :filters)))
  (def filter-content (read-if-file filters-path))
  #
  (setdyn :downloaded dl-content)
  (setdyn :filters filter-content)
  #
  # hashes already in the file, plus those written during this run
  (def seen?
    (if dl-content
      (invert (string/split "\n" (string/trim dl-content)))
      @{}))
  #
  (each feed-url args
    (def fresh (process-feed feed-url))
    #
    (unless (empty? fresh)
      (with [out (file/open dl-path :a)]
        (each h fresh
          # never write the same hash twice
          (unless (get seen? h)
            (put seen? h true)
            (file/write out h "\n")))))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment