Skip to content

Instantly share code, notes, and snippets.

@Thorium
Created February 10, 2026 18:17
Show Gist options
  • Select an option

  • Save Thorium/a8bbd9f772adfa9798060d84c96bc5da to your computer and use it in GitHub Desktop.

Select an option

Save Thorium/a8bbd9f772adfa9798060d84c96bc5da to your computer and use it in GitHub Desktop.
1) Script for downloading reMarkable PDFs from their cloud service, 2) Script to convert PDFs to Markdown (pdf2md) to be consumed e.g. by agents.
#!/usr/bin/env dotnet fsi
// =============================================================================
// reMarkable Cloud File Downloader and PDF to Markdown (pdf2md) - F# Script
// Based on the PHP ReMarkableAPI by splitbrain/remarkable-api
// Downloads files from the reMarkable cloud and converts PDFs to Markdown, e.g. for agents to read.
// You need a one-time code from the reMarkable device-connect web page to obtain the auth token (see `register`).
// =============================================================================
//
// Usage:
// dotnet fsi remarkable-download.fsx register <one-time-code>
// dotnet fsi remarkable-download.fsx list [/path]
// dotnet fsi remarkable-download.fsx download <path-or-id> [output-dir]
//
// The script stores the auth token in 'auth.token' in the current directory.
// =============================================================================
open System
open System.IO
open System.Net.Http
open System.Net.Http.Headers
open System.Text
open System.Text.Json
open System.Threading.Tasks
// =============================================================================
// Constants
// =============================================================================
/// Base URL of the authentication API (device registration, token refresh).
[<Literal>]
let AuthApi = "https://webapp-prod.cloud.remarkable.engineering"

/// Base URL of the service-discovery endpoint queried by `Api.discoverStorage`.
[<Literal>]
let ServiceDiscoveryApi = "https://service-manager-production-dot-remarkable-production.appspot.com"

/// Storage API base URL used until (or unless) service discovery supplies another host.
/// Marked as a literal for consistency with the other constants in this section.
[<Literal>]
let DefaultStorageApi = "https://document-storage-production-dot-remarkable-production.appspot.com"

/// File (relative to the current directory) in which the auth token is persisted.
[<Literal>]
let TokenFile = "auth.token"

/// Value of `RemarkableItem.Type` that the script treats as a folder.
[<Literal>]
let TypeCollection = "CollectionType"

/// Value of `RemarkableItem.Type` that the script treats as a downloadable document.
[<Literal>]
let TypeDocument = "DocumentType"
// =============================================================================
// Data types
// =============================================================================
/// One entry of the cloud document listing, as produced by `parseItem`.
/// Every field except `Path` is read from the JSON property of the same name;
/// properties that are missing in the JSON become "" / 0 / false.
type RemarkableItem = {
    ID: string
    Version: int
    Message: string
    Success: bool
    // Download URL; `downloadSingleFile` treats an empty value as "nothing to download".
    BlobURLGet: string
    BlobURLGetExpires: string
    // Timestamp string; `cmdList` tries to parse it with DateTimeOffset.Parse.
    ModifiedClient: string
    // Compared against TypeCollection / TypeDocument.
    Type: string
    // The spelling matches the "VissibleName" JSON key that `parseItem` reads — do not "fix" it
    // without checking the API payload.
    VissibleName: string
    // ID of the parent item; `FS.buildTree` treats an empty value as "lives at the root".
    Parent: string
    CurrentPage: int
    Bookmarked: bool
    // Computed client-side
    // (left empty by `parseItem`, filled in by `FS.buildTree`)
    Path: string
}
/// Shape of the service-discovery reply ("Status" and "Host" properties).
/// NOTE(review): not referenced in the visible part of this file — `Api.discoverStorage`
/// reads the two properties directly from the JSON document instead.
type ServiceDiscoveryResponse = {
    Status: string
    Host: string
}
// =============================================================================
// JSON helpers
// =============================================================================
/// Shared System.Text.Json options: property names are matched case-insensitively.
let jsonOptions =
    JsonSerializerOptions(PropertyNameCaseInsensitive = true)
/// Convert one JSON element of the document listing into a `RemarkableItem`.
///
/// Every field is optional: a property that is missing, or whose JSON kind does not
/// match the expected one, falls back to "" (strings), 0 (ints) or false (bools).
/// Numbers that do not fit an Int32 (fractional or out of range) also fall back to 0
/// instead of raising, so one odd entry cannot abort parsing of the whole listing.
/// `Path` is always left empty here; it is computed later by `FS.buildTree`.
let parseItem (elem: JsonElement) : RemarkableItem =
    // TryGetProperty raises on non-object elements, so guard the kind first.
    let tryProp (name: string) =
        if elem.ValueKind <> JsonValueKind.Object then
            None
        else
            match elem.TryGetProperty(name) with
            | true, v -> Some v
            | _ -> None
    let str (name: string) =
        match tryProp name with
        | Some v when v.ValueKind = JsonValueKind.String -> v.GetString()
        | _ -> ""
    let int' (name: string) =
        match tryProp name with
        | Some v when v.ValueKind = JsonValueKind.Number ->
            match v.TryGetInt32() with
            | true, n -> n
            | _ -> 0
        | _ -> 0
    let bool' (name: string) =
        // Only a JSON `true` yields true; `false`, other kinds and absence all yield false.
        match tryProp name with
        | Some v -> v.ValueKind = JsonValueKind.True
        | None -> false
    {
        ID = str "ID"
        Version = int' "Version"
        Message = str "Message"
        Success = bool' "Success"
        BlobURLGet = str "BlobURLGet"
        BlobURLGetExpires = str "BlobURLGetExpires"
        ModifiedClient = str "ModifiedClient"
        Type = str "Type"
        VissibleName = str "VissibleName"
        Parent = str "Parent"
        CurrentPage = int' "CurrentPage"
        Bookmarked = bool' "Bookmarked"
        Path = ""
    }
/// Parse a JSON array of listing entries into `RemarkableItem`s.
///
/// The `JsonDocument` rents pooled buffers, so it is disposed as soon as the list
/// has been built; the list is materialised eagerly and `parseItem` copies every
/// value out of the document, so nothing refers to the document afterwards.
/// Raises whatever `JsonDocument.Parse` / `EnumerateArray` raise for malformed
/// JSON or a non-array root.
let parseItems (json: string) : RemarkableItem list =
    use doc = JsonDocument.Parse(json)
    [ for elem in doc.RootElement.EnumerateArray() -> parseItem elem ]
// =============================================================================
// Console helpers
// =============================================================================
/// Small helpers for coloured console output.
module Console =
    /// Run `f` with the console foreground colour set to `c` and return its result.
    /// The previous colour is restored even when `f` raises.
    let color c (f: unit -> 'a) =
        let old = System.Console.ForegroundColor
        System.Console.ForegroundColor <- c
        try
            f ()
        finally
            System.Console.ForegroundColor <- old

    /// Print "ERROR: msg" to stderr in red.
    let error msg =
        color ConsoleColor.Red (fun () -> eprintfn "ERROR: %s" msg)

    /// Print "WARNING: msg" to stderr in yellow.
    let warn msg =
        color ConsoleColor.Yellow (fun () -> eprintfn "WARNING: %s" msg)

    /// Print `msg` to stdout in cyan.
    let info msg =
        color ConsoleColor.Cyan (fun () -> printfn "%s" msg)

    /// Print `msg` to stdout in green.
    let success msg =
        color ConsoleColor.Green (fun () -> printfn "%s" msg)
// =============================================================================
// Progress bar
// =============================================================================
/// Text progress bar written to stderr.
module ProgressBar =
    let private barWidth = 40

    let private toMB (bytes: int64) = float bytes / (1024.0 * 1024.0)

    /// Redraw the bar in place (carriage return, no newline).
    /// `total <= 0` means "size unknown": a bar of '?' plus the byte count so far.
    /// The filled fraction is clamped to [0, 1], so a transfer that delivers more
    /// bytes than announced cannot produce a negative-width segment (which would
    /// make the String constructor throw).
    let render (label: string) (current: int64) (total: int64) =
        let text =
            if total > 0L then
                let pct = float current / float total |> max 0.0 |> min 1.0
                let filled = int (pct * float barWidth)
                let bar = String('#', filled) + String('-', barWidth - filled)
                sprintf "\r %s [%s] %5.1f%% (%5.1f / %5.1f MB)" label bar (pct * 100.0) (toMB current) (toMB total)
            else
                sprintf "\r %s [%s] %5.1f MB" label (String('?', barWidth)) (toMB current)
        Console.Error.Write(text)

    /// Draw the final, fully filled bar and terminate the line.
    let complete (label: string) (total: int64) =
        let bar = String('#', barWidth)
        let text = sprintf "\r %s [%s] 100.0%% (%5.1f MB)" label bar (toMB total)
        Console.Error.WriteLine(text)
// =============================================================================
// HTTP Client wrapper
// =============================================================================
/// HTTP client for the reMarkable cloud endpoints.
/// Holds the bearer token and the storage API base URL; every request carries
/// `Authorization: Bearer <token>` when a token is set. Dispose to release the
/// underlying `HttpClient`.
type RemarkableClient() =
    let httpClient = new HttpClient()
    let mutable token = ""
    let mutable storageApi = DefaultStorageApi

    /// Create a request for `url`, attaching the bearer token when one is set.
    /// The caller owns (and must dispose) the returned message.
    let createRequest (httpMethod: HttpMethod) (url: string) =
        let msg = new HttpRequestMessage(httpMethod, url)
        if not (String.IsNullOrEmpty(token)) then
            msg.Headers.Authorization <- AuthenticationHeaderValue("Bearer", token)
        msg

    /// Send `msg`, raise on a non-success status code and return the body as text.
    /// Both the request and the response are disposed before the task completes.
    let sendForString (msg: HttpRequestMessage) = task {
        use request = msg
        use! resp = httpClient.SendAsync(request)
        resp.EnsureSuccessStatusCode() |> ignore
        return! resp.Content.ReadAsStringAsync()
    }

    /// Bearer token sent with every request ("" = send no Authorization header).
    member _.Token
        with get() = token
        and set(v) = token <- v

    /// Base URL of the document-storage API.
    member _.StorageApi
        with get() = storageApi
        and set(v) = storageApi <- v

    /// GET `url` and return the response body; raises HttpRequestException on non-2xx.
    member this.GetStringAsync(url: string) = task {
        return! sendForString (createRequest HttpMethod.Get url)
    }

    /// POST `body` as application/json (no content at all when `body` is empty)
    /// and return the response body; raises HttpRequestException on non-2xx.
    member this.PostStringAsync(url: string, body: string) = task {
        let msg = createRequest HttpMethod.Post url
        if not (String.IsNullOrEmpty(body)) then
            msg.Content <- new StringContent(body, Encoding.UTF8, "application/json")
        return! sendForString msg
    }

    /// PUT `body` as application/json and return the response body;
    /// raises HttpRequestException on non-2xx.
    member this.PutJsonAsync(url: string, body: string) = task {
        let msg = createRequest HttpMethod.Put url
        msg.Content <- new StringContent(body, Encoding.UTF8, "application/json")
        return! sendForString msg
    }

    /// Stream `url` into `destPath` (overwriting it), drawing a progress bar
    /// labelled `label`. Returns the number of bytes written.
    /// NOTE(review): the bearer token is attached to this request as well, as in
    /// the original code — confirm the blob host expects/tolerates it.
    member this.DownloadFileAsync(url: string, destPath: string, label: string) = task {
        use msg = createRequest HttpMethod.Get url
        // ResponseHeadersRead: start streaming instead of buffering the whole body.
        use! resp = httpClient.SendAsync(msg, HttpCompletionOption.ResponseHeadersRead)
        resp.EnsureSuccessStatusCode() |> ignore
        // 0 means "length unknown"; the progress bar renders that case differently.
        let totalBytes =
            if resp.Content.Headers.ContentLength.HasValue then
                resp.Content.Headers.ContentLength.Value
            else
                0L
        use! stream = resp.Content.ReadAsStreamAsync()
        use fileStream = new FileStream(destPath, FileMode.Create, FileAccess.Write, FileShare.None, 8192, true)
        let buffer = Array.zeroCreate<byte> 81920
        let mutable totalRead = 0L
        let mutable reading = true
        while reading do
            let! bytesRead = stream.ReadAsync(buffer, 0, buffer.Length)
            if bytesRead = 0 then
                reading <- false
            else
                do! fileStream.WriteAsync(buffer, 0, bytesRead)
                totalRead <- totalRead + int64 bytesRead
                ProgressBar.render label totalRead totalBytes
        ProgressBar.complete label totalRead
        return totalRead
    }

    interface IDisposable with
        member _.Dispose() = httpClient.Dispose()
// =============================================================================
// reMarkable API functions
// =============================================================================
module Api =
/// Register a new device with a one-time code and return the device token.
///
/// The code is trimmed and JSON-escaped before being placed in the request body,
/// so a pasted code containing quotes, backslashes or stray whitespace cannot
/// produce malformed JSON. Raises HttpRequestException when the server rejects
/// the request.
let register (client: RemarkableClient) (code: string) = task {
    let deviceId = Guid.NewGuid().ToString()
    let safeCode = JsonEncodedText.Encode(code.Trim()).ToString()
    let body = sprintf """{"code":"%s","deviceDesc":"desktop-windows","deviceID":"%s"}""" safeCode deviceId
    let url = sprintf "%s/token/json/2/device/new" AuthApi
    let! token = client.PostStringAsync(url, body)
    // The response is a raw JWT token string (may be quoted)
    return token.Trim().Trim('"')
}
/// Exchange the client's current token for a fresh bearer token.
/// The new token (whitespace and surrounding quotes stripped) is stored on the
/// client and also returned.
let refreshToken (client: RemarkableClient) = task {
    let endpoint = AuthApi + "/token/json/2/user/new"
    let! raw = client.PostStringAsync(endpoint, "")
    let refreshed = raw.Trim().Trim('"')
    client.Token <- refreshed
    return refreshed
}
/// Ask the service-discovery endpoint for the storage host and, when it answers
/// with Status "OK" and a non-empty Host, point `client.StorageApi` at it.
/// In every other case the client's current storage URL is kept and a warning
/// is printed. Raises on HTTP failure or unparseable JSON.
let discoverStorage (client: RemarkableClient) = task {
    let url = sprintf "%s/service/json/1/document-storage?environment=production&group=auth0%%7C5a68dc51cb30df3877a1d7c4&apiVer=2" ServiceDiscoveryApi
    let! json = client.GetStringAsync(url)
    use doc = JsonDocument.Parse(json)
    let root = doc.RootElement
    // Read a property only when the root is an object and the value is a JSON
    // string; GetString/TryGetProperty raise on other kinds.
    let tryString (name: string) =
        if root.ValueKind <> JsonValueKind.Object then
            None
        else
            match root.TryGetProperty(name) with
            | true, v when v.ValueKind = JsonValueKind.String -> Some (v.GetString())
            | _ -> None
    match tryString "Status", tryString "Host" with
    | Some "OK", Some host when not (String.IsNullOrWhiteSpace(host)) ->
        client.StorageApi <- sprintf "https://%s" host
    | Some "OK", _ ->
        Console.warn "Service discovery returned no usable host, using default storage API"
    | _ ->
        Console.warn "Service discovery did not return OK status, using default storage API"
}
/// Prepare `client` for use: install the saved token, trade it for a fresh one,
/// then run storage discovery. Returns the fresh token so the caller can persist it.
let init (client: RemarkableClient) (savedToken: string) = task {
    client.Token <- savedToken
    let! refreshed = refreshToken client
    do! discoverStorage client
    return refreshed
}
/// Fetch the complete, flat listing of items from the storage API.
let listItems (client: RemarkableClient) = task {
    let endpoint = client.StorageApi + "/document-storage/json/2/docs"
    let! payload = client.GetStringAsync(endpoint)
    return parseItems payload
}
/// Get a single item by ID, optionally asking the server to include the blob
/// download URL (`withBlob = true`). Returns the first entry of the reply, or
/// None when the reply is an empty array.
/// The ID is percent-encoded before it is placed in the query string, so an ID
/// containing reserved characters cannot alter the query.
let getItem (client: RemarkableClient) (id: string) (withBlob: bool) = task {
    let blobParam = if withBlob then "&withBlob=true" else ""
    let url = sprintf "%s/document-storage/json/2/docs?doc=%s%s" client.StorageApi (Uri.EscapeDataString(id)) blobParam
    let! json = client.GetStringAsync(url)
    return parseItems json |> List.tryHead
}
// =============================================================================
// Filesystem tree (reconstructing hierarchy from flat items)
// =============================================================================
module FS =
/// Build a lookup index and compute paths for all items.
///
/// Returns `(indexWithPaths, tree)`:
///   * `indexWithPaths` maps item ID -> item with `Path` filled in;
///   * `tree` maps a path -> all items that resolved to that path.
/// An item's path is "/" + names of its ancestors + its own name. Items whose
/// parent chain cannot be resolved (unknown parent ID, or a chain that loops
/// back on itself) are dropped from both maps.
let buildTree (items: RemarkableItem list) : Map<string, RemarkableItem> * Map<string, RemarkableItem list> =
    let index = items |> List.map (fun i -> i.ID, i) |> Map.ofList
    // `visited` holds the IDs already on the current parent chain; meeting one
    // again means the data contains a cycle, which would otherwise recurse
    // until the stack overflows.
    let rec calcPath (visited: Set<string>) (item: RemarkableItem) : string option =
        if String.IsNullOrEmpty(item.Parent) then
            Some (sprintf "/%s" item.VissibleName)
        elif Set.contains item.ID visited then
            None // cyclic parent chain
        else
            Map.tryFind item.Parent index // None = orphan item
            |> Option.bind (calcPath (Set.add item.ID visited))
            |> Option.map (fun parentPath -> sprintf "%s/%s" parentPath item.VissibleName)
    // Compute paths for all items
    let itemsWithPaths =
        items
        |> List.choose (fun item ->
            calcPath Set.empty item
            |> Option.map (fun path -> { item with Path = path }))
    let indexWithPaths =
        itemsWithPaths |> List.map (fun i -> i.ID, i) |> Map.ofList
    let tree =
        itemsWithPaths
        |> List.groupBy (fun i -> i.Path)
        |> Map.ofList
    (indexWithPaths, tree)
/// Collect every item located at `path` itself or anywhere below it.
/// Matching ignores case; the result follows the key order of the index.
let findUnderPath (indexWithPaths: Map<string, RemarkableItem>) (path: string) : RemarkableItem list =
    let prefix = if path.EndsWith("/") then path else path + "/"
    let exact = path.TrimEnd('/')
    [ for KeyValue(_, item) in indexWithPaths do
        if item.Path.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
           || item.Path.Equals(exact, StringComparison.OrdinalIgnoreCase) then
            yield item ]
/// Look up the first item (in index key order) whose path equals `path`,
/// ignoring case. When `itemType` is given, the item's `Type` must match it too.
let findByPath (indexWithPaths: Map<string, RemarkableItem>) (path: string) (itemType: string option) : RemarkableItem option =
    let typeMatches (item: RemarkableItem) =
        match itemType with
        | Some wanted -> item.Type = wanted
        | None -> true
    indexWithPaths
    |> Map.toSeq
    |> Seq.map snd
    |> Seq.tryFind (fun item ->
        item.Path.Equals(path, StringComparison.OrdinalIgnoreCase) && typeMatches item)
/// List the items exactly one level below `parentPath` ("" or "/" = root).
/// Folders come first, then documents; each group is ordered by lower-cased name.
let directChildren (indexWithPaths: Map<string, RemarkableItem>) (parentPath: string) : RemarkableItem list =
    let parent = parentPath.TrimEnd('/')
    let isDirectChild (item: RemarkableItem) =
        if parent = "" then
            // Root level: the path has a single, non-empty component.
            let segments = item.Path.TrimStart('/').Split('/')
            segments.Length = 1 && segments.[0] <> ""
        else
            // Below a folder: path is "<parent>/<name>" with no further separator.
            let prefix = parent + "/"
            if item.Path.StartsWith(prefix, StringComparison.OrdinalIgnoreCase) then
                let remainder = item.Path.Substring(prefix.Length)
                remainder <> "" && not (remainder.Contains("/"))
            else
                false
    // `false` sorts before `true`, so collections precede everything else.
    let sortKey (item: RemarkableItem) =
        (item.Type <> TypeCollection, item.VissibleName.ToLowerInvariant())
    indexWithPaths
    |> Map.toList
    |> List.map snd
    |> List.filter isDirectChild
    |> List.sortBy sortKey
// =============================================================================
// Commands
// =============================================================================
module Commands =
// --- Token persistence ---
/// Read the saved token (trimmed) from `TokenFile`; None when the file does not exist.
let loadToken () =
    match File.Exists(TokenFile) with
    | true -> File.ReadAllText(TokenFile).Trim() |> Some
    | false -> None
/// Write `token` to `TokenFile`, replacing any previous content.
let saveToken (token: string) =
    File.WriteAllText(path = TokenFile, contents = token)
// --- Register ---
/// `register <code>`: trade a one-time code for a device token and persist it.
let cmdRegister (code: string) = task {
    use client = new RemarkableClient()
    Console.info "Registering device with reMarkable cloud..."
    let! issuedToken = Api.register client code
    issuedToken |> saveToken
    Console.success "Registration successful! Token saved to auth.token"
}
// --- Initialize client (shared by list/download) ---
/// Load the saved token, refresh it, discover the storage host and persist the
/// refreshed token.
///
/// Returns `Some client` on success — the caller owns the client and must
/// dispose it — or `None` (after printing an error) when no token is saved or
/// initialisation fails. A single client is created and handed to the caller;
/// it is disposed here only on the failure path.
let initClient () = task {
    match loadToken () with
    | None ->
        Console.error "No auth token found. Run 'register <code>' first."
        Console.info "Get a code at: https://my.remarkable.com/device/connect/desktop"
        return None
    | Some savedToken ->
        let client = new RemarkableClient()
        try
            let! newToken = Api.init client savedToken
            saveToken newToken
            return Some client
        with ex ->
            // Nobody will receive the client, so release it before reporting.
            (client :> IDisposable).Dispose()
            Console.error (sprintf "Failed to initialize: %s" ex.Message)
            return None
}
// --- List ---
/// `list [/path]`: print a table of the direct children of `path` (root when None),
/// followed by folder/document counts. Prints a warning when nothing is found.
let cmdList (path: string option) = task {
    let! clientOpt = initClient ()
    match clientOpt with
    | None -> ()
    | Some client ->
        use _client = client
        Console.info "Fetching document list..."
        let! items = Api.listItems client
        let index, _ = FS.buildTree items
        let targetPath =
            path
            |> Option.map (fun p -> p.TrimEnd('/'))
            |> Option.defaultValue ""
        let displayPath = if targetPath = "" then "/" else targetPath

        // Local time as yyyy-MM-dd HH:mm; unparseable values are shown raw, cut to 19 chars.
        let formatModified (raw: string) =
            if String.IsNullOrEmpty(raw) then "—"
            else
                try
                    DateTimeOffset.Parse(raw).LocalDateTime.ToString("yyyy-MM-dd HH:mm")
                with _ -> raw.Substring(0, min 19 raw.Length)

        // Keep the name column at 40 characters.
        let shorten (name: string) =
            if name.Length > 40 then name.Substring(0, 37) + "..." else name

        match FS.directChildren index targetPath with
        | [] ->
            Console.warn (sprintf "No items found under '%s'" displayPath)
        | children ->
            printfn ""
            Console.info (sprintf "Contents of %s:" displayPath)
            printfn ""
            // Print header
            printfn " %-6s %-40s %-20s %s" "Type" "Name" "Modified" "ID"
            printfn " %s %s %s %s" (String('-', 6)) (String('-', 40)) (String('-', 20)) (String('-', 36))
            for item in children do
                let isFolder = item.Type = TypeCollection
                let icon = if isFolder then "[DIR]" else "[DOC]"
                let rowColor = if isFolder then ConsoleColor.Yellow else ConsoleColor.White
                let modified = formatModified item.ModifiedClient
                let name = shorten item.VissibleName
                Console.color rowColor (fun () ->
                    printfn " %-6s %-40s %-20s %s" icon name modified item.ID)
            printfn ""
            let countOfType (wanted: string) =
                children |> List.sumBy (fun i -> if i.Type = wanted then 1 else 0)
            printfn " %d folder(s), %d document(s)" (countOfType TypeCollection) (countOfType TypeDocument)
            printfn ""
}
// --- Download helpers ---
/// Replace every character that the current platform forbids in file names with '_'.
let sanitizeFileName (name: string) =
    let forbidden = Path.GetInvalidFileNameChars()
    let chars = name.ToCharArray()
    for i in 0 .. chars.Length - 1 do
        if Array.contains chars.[i] forbidden then
            chars.[i] <- '_'
    String(chars)
/// Download one document as "<sanitized name>.zip" into `outputDir` (created on demand).
/// Returns true on success; prints an error and returns false when the item cannot
/// be fetched or carries no download URL.
let downloadSingleFile (client: RemarkableClient) (item: RemarkableItem) (outputDir: string) = task {
    // Get the download URL
    let! lookup = Api.getItem client item.ID true
    match lookup with
    | None ->
        Console.error (sprintf "Could not retrieve item: %s" item.VissibleName)
        return false
    | Some itemData when String.IsNullOrEmpty(itemData.BlobURLGet) ->
        Console.error (sprintf "No download URL for: %s (is it a folder?)" item.VissibleName)
        return false
    | Some itemData ->
        let target = Path.Combine(outputDir, sanitizeFileName item.VissibleName + ".zip")
        // Ensure output directory exists
        Directory.CreateDirectory(outputDir) |> ignore
        let! _bytesWritten = client.DownloadFileAsync(itemData.BlobURLGet, target, item.VissibleName)
        return true
}
// --- Download ---
/// `download <path-or-id> [output-dir]`: download one document, or every document
/// below a folder, into `outputDir` (default "."). When exactly one folder is
/// selected, its sub-folder structure is recreated below the output directory.
/// Prints a per-file progress line and a final success/failure summary.
let cmdDownload (pathOrId: string) (outputDir: string option) = task {
    let! clientOpt = initClient ()
    match clientOpt with
    | None -> ()
    | Some client ->
        use _client = client
        Console.info "Fetching document list..."
        let! items = Api.listItems client
        let index, _ = FS.buildTree items
        let outDir = outputDir |> Option.defaultValue "."
        let isDocument (i: RemarkableItem) = i.Type = TypeDocument

        // An argument that parses as a GUID is treated as an item ID, anything else as a path.
        let isUuid =
            match Guid.TryParse(pathOrId) with
            | parsed, _ -> parsed

        let targetItems =
            if isUuid then
                // Direct ID lookup
                index
                |> Map.tryFind pathOrId
                |> Option.map List.singleton
                |> Option.defaultWith (fun () ->
                    Console.error (sprintf "No item found with ID: %s" pathOrId)
                    [])
            else
                let normalizedPath =
                    if pathOrId.StartsWith("/") then pathOrId else "/" + pathOrId
                // Exact match first, then fall back to treating the path as a prefix.
                FS.findByPath index normalizedPath None
                |> Option.map List.singleton
                |> Option.defaultWith (fun () ->
                    let found = FS.findUnderPath index normalizedPath
                    if found.IsEmpty then
                        Console.error (sprintf "No item found matching: %s" pathOrId)
                    found)

        if not (List.isEmpty targetItems) then
            // Set only when the selection is exactly one folder.
            let selectedFolder =
                match targetItems with
                | [ single ] when single.Type = TypeCollection -> Some single
                | _ -> None

            let documents =
                match selectedFolder with
                | Some folder ->
                    // Single folder selected: take all documents inside it recursively
                    FS.findUnderPath index folder.Path |> List.filter isDocument
                | None ->
                    targetItems |> List.filter isDocument

            // Directory of `doc` relative to the selected folder ("" when flat).
            let relativeDirFor (doc: RemarkableItem) =
                match selectedFolder with
                | None -> ""
                | Some folder ->
                    let segments = doc.Path.Substring(folder.Path.Length).TrimStart('/').Split('/')
                    if segments.Length > 1 then
                        // Everything but the last segment is a sub-directory.
                        segments
                        |> Array.take (segments.Length - 1)
                        |> Array.map sanitizeFileName
                        |> String.concat (string Path.DirectorySeparatorChar)
                    else
                        ""

            if documents.IsEmpty then
                Console.warn "No downloadable documents found at the specified path."
            else
                printfn ""
                Console.info (sprintf "Downloading %d document(s) to '%s'..." documents.Length outDir)
                printfn ""
                let mutable successCount = 0
                let mutable failCount = 0
                for (position, doc) in List.indexed documents do
                    printfn " [%d/%d] %s" (position + 1) documents.Length doc.Path
                    let relativePath = relativeDirFor doc
                    let destDir =
                        if String.IsNullOrEmpty(relativePath) then outDir
                        else Path.Combine(outDir, relativePath)
                    try
                        let! ok = downloadSingleFile client doc destDir
                        if ok then successCount <- successCount + 1
                        else failCount <- failCount + 1
                    with ex ->
                        Console.error (sprintf "Failed to download '%s': %s" doc.VissibleName ex.Message)
                        failCount <- failCount + 1
                printfn ""
                Console.success (sprintf "Download complete: %d succeeded, %d failed" successCount failCount)
                printfn ""
}
// =============================================================================
// PDF to Markdown conversion
// =============================================================================
module Pdf2Md =
open System.Diagnostics
/// Result of a conversion attempt
type ConvertResult =
    /// The converter produced a usable Markdown file; carries the path of that file.
    | Success of outputPath: string
    /// The conversion did not produce usable output; carries a human-readable reason.
    | Failed of reason: string
/// Run an external process with optional extra environment variables and capture
/// its output.
///
/// Returns:
///   * `Some (exitCode, stdout, stderr)` when the process ran to completion;
///   * `Some (-1, "", message)` when it exceeded the timeout and was killed;
///   * `None` when it could not be started or any other exception occurred
///     (callers treat this as "command not available").
let private runProcessWithEnv (fileName: string) (arguments: string) (workDir: string option) (envVars: (string * string) list) =
    // Single source of truth for the limit, so the message cannot drift from it.
    let timeoutMinutes = 5
    let psi = ProcessStartInfo(fileName, arguments)
    psi.UseShellExecute <- false
    psi.RedirectStandardOutput <- true
    psi.RedirectStandardError <- true
    psi.CreateNoWindow <- true
    workDir |> Option.iter (fun d -> psi.WorkingDirectory <- d)
    for (key, value) in envVars do
        psi.EnvironmentVariables.[key] <- value
    try
        use proc = Process.Start(psi)
        // Read stdout/stderr asynchronously to avoid deadlocks
        let stdoutTask = proc.StandardOutput.ReadToEndAsync()
        let stderrTask = proc.StandardError.ReadToEndAsync()
        let exited = proc.WaitForExit(timeoutMinutes * 60 * 1000)
        if not exited then
            // Best effort: the process may already have exited by the time Kill runs.
            try proc.Kill(true) with _ -> ()
            Some (-1, "", sprintf "Process timed out after %d minutes" timeoutMinutes)
        else
            let stdout = stdoutTask.Result
            let stderr = stderrTask.Result
            Some (proc.ExitCode, stdout, stderr)
    with _ ->
        // Deliberately broad: probing for optional tools must never crash the script.
        None
/// Convenience form of `runProcessWithEnv` that adds no environment variables.
let private runProcess (fileName: string) (arguments: string) (workDir: string option) =
    [] |> runProcessWithEnv fileName arguments workDir
/// True when `cmd` can be located on PATH ("where" on Windows NT, "which" elsewhere):
/// the locator must exit with 0 and print something.
let private commandExists (cmd: string) =
    let locator =
        match Environment.OSVersion.Platform with
        | PlatformID.Win32NT -> "where"
        | _ -> "which"
    match runProcess locator cmd None with
    | Some (0, out, _) -> not (String.IsNullOrWhiteSpace(out))
    | _ -> false
/// Probe for a usable marker-pdf installation.
/// Tries, in order: the `marker_single` CLI, the marker Python API via `python`,
/// the same via `python3`, and `marker_single` inside WSL. Returns the label of
/// the first probe that succeeds, or None; later probes are not run once one succeeds.
let isMarkerAvailable () =
    // Succeeds when `exe args` exits with code 0.
    let helpProbe (exe: string) (args: string) (label: string) () =
        match runProcess exe args None with
        | Some (0, _, _) -> Some label
        | _ -> None
    // Succeeds when the interpreter can import marker's PdfConverter and prints "ok".
    let importProbe (python: string) (label: string) () =
        match runProcess python "-c \"from marker.converters.pdf import PdfConverter; print('ok')\"" None with
        | Some (0, out, _) when out.Trim() = "ok" -> Some label
        | _ -> None
    [ helpProbe "marker_single" "--help" "marker_single"
      importProbe "python" "python-api"
      importProbe "python3" "python3-api"
      helpProbe "wsl" "marker_single --help" "wsl marker_single" ]
    |> List.tryPick (fun probe -> probe ())
/// Probe for pdftotext: natively on PATH first ("pdftotext"), then inside WSL
/// ("wsl pdftotext"). None when neither is found.
let isPdfToTextAvailable () =
    match commandExists "pdftotext" with
    | true -> Some "pdftotext"
    | false ->
        match runProcess "wsl" "which pdftotext" None with
        | Some (0, out, _) when not (String.IsNullOrWhiteSpace(out)) -> Some "wsl pdftotext"
        | _ -> None
/// Check if pdfplumber is available via WSL (pdf2md.py script).
///
/// Returns the local path of `pdf2md.py` when both (a) the script file is found
/// and (b) the WSL virtual-env interpreter can import pdfplumber; None otherwise.
/// The script is searched next to this script file first, then in the host's
/// base directory and the current directory.
let isPdfPlumberAvailable () =
    let pdf2mdPaths = [
        // Directory of this .fsx file. Under `dotnet fsi` the AppDomain base
        // directory below is the host's directory, not the script's.
        Path.Combine(__SOURCE_DIRECTORY__, "pdf2md.py")
        Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "pdf2md.py")
        Path.Combine(Environment.CurrentDirectory, "pdf2md.py")
    ]
    match pdf2mdPaths |> List.tryFind File.Exists with
    | None -> None
    | Some localPath ->
        // Check if WSL pdfplumber venv is available (MSYS_NO_PATHCONV prevents Git Bash path mangling)
        // NOTE(review): the interpreter path is specific to one user's WSL home directory.
        let wslEnv = [("MSYS_NO_PATHCONV", "1")]
        match runProcessWithEnv "wsl" "-- /home/thorium/.spectrum/venv/bin/python3 -c \"import pdfplumber; print('ok')\"" None wslEnv with
        | Some (0, out, _) when out.Trim() = "ok" -> Some localPath
        | _ -> None
/// Convert a Windows path (e.g. C:\foo\bar) to WSL path (/mnt/c/foo/bar).
///
/// The input is made absolute first. Paths that do not start with a drive letter
/// (UNC paths, or paths that are already POSIX-style) cannot be mapped to /mnt/<drive>;
/// they are returned absolute with the platform separator replaced by '/'.
/// The drive letter is lower-cased culture-invariantly so the result does not
/// depend on the current locale.
let private toWslPath (windowsPath: string) =
    let fullPath = Path.GetFullPath(windowsPath)
    let hasDriveLetter =
        fullPath.Length >= 2 && Char.IsLetter(fullPath.[0]) && fullPath.[1] = ':'
    if hasDriveLetter then
        let drive = Char.ToLowerInvariant(fullPath.[0])
        let rest = fullPath.[2..].Replace('\\', '/')
        $"/mnt/{drive}{rest}"
    else
        fullPath.Replace(Path.DirectorySeparatorChar, '/')
/// Convert PDF using marker-pdf (high quality, ML-based).
///
/// `markerCmd` is one of the labels returned by `isMarkerAvailable`:
///   * "...-api"  -> run a generated Python script through python/python3;
///   * "wsl ..."  -> run `marker_single` inside WSL with translated paths;
///   * otherwise  -> run `markerCmd` itself as the CLI.
/// Returns `Success path` when a Markdown file with more than 50 non-blank-trimmed
/// characters was produced, otherwise `Failed reason`.
let convertWithMarker (pdfPath: string) (outputDir: string) (markerCmd: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let expectedOutputDir = Path.Combine(outputDir, baseName)
    let outputMdPath = Path.Combine(outputDir, baseName + ".md")
    // Escape a path for use inside a single-quoted Python string literal.
    let pyQuote (s: string) = s.Replace("\\", "\\\\").Replace("'", "\\'")
    // Third component: temp script to delete afterwards (API mode only).
    let cmd, args, tempScript =
        if markerCmd.EndsWith("-api") then
            let pythonCmd = if markerCmd.StartsWith("python3") then "python3" else "python"
            let escapedOutDir = pyQuote outputDir
            let escapedPdf = pyQuote pdfFull
            let escapedMd = pyQuote outputMdPath
            // Unique name: concurrent conversions must not overwrite each other's script.
            let scriptName = sprintf "marker_convert_%s.py" (Guid.NewGuid().ToString("N"))
            let tempScript = Path.Combine(Path.GetTempPath(), scriptName)
            // Python is indentation-sensitive: bodies are indented 4 spaces, nested bodies 8.
            let pyCode = [
                "import os"
                "from multiprocessing import freeze_support"
                ""
                "def main():"
                $"    os.makedirs('{escapedOutDir}', exist_ok=True)"
                "    from marker.models import create_model_dict"
                "    from marker.converters.pdf import PdfConverter"
                "    from marker.config.parser import ConfigParser"
                "    config = ConfigParser({'output_format': 'markdown'})"
                "    artifacts = create_model_dict()"
                "    converter = PdfConverter(artifact_dict=artifacts, config=config.generate_config_dict())"
                $"    rendered = converter('{escapedPdf}')"
                $"    with open('{escapedMd}', 'w', encoding='utf-8') as f:"
                "        f.write(rendered.markdown)"
                "    print(f'Wrote {len(rendered.markdown)} chars')"
                ""
                "if __name__ == '__main__':"
                "    freeze_support()"
                "    main()"
            ]
            File.WriteAllLines(tempScript, pyCode)
            pythonCmd, $"\"{tempScript}\"", Some tempScript
        elif markerCmd.StartsWith("wsl") then
            let wslPath = toWslPath pdfFull
            let wslOutDir = toWslPath (Path.GetFullPath(outputDir))
            "wsl", $"marker_single \"{wslPath}\" --output_dir \"{wslOutDir}\" --output_format markdown", None
        else
            markerCmd, $"\"{pdfFull}\" --output_dir \"{outputDir}\" --output_format markdown", None
    Console.info $" Converting with marker-pdf: {baseName}"
    let outcome =
        try
            runProcess cmd args None
        finally
            // Best-effort cleanup; a leftover temp file must not fail the conversion.
            tempScript |> Option.iter (fun p -> try File.Delete(p) with _ -> ())
    // Accept a Markdown file only when it has meaningful content.
    let validate (mdFile: string) =
        if File.ReadAllText(mdFile).Trim().Length > 50 then Success mdFile
        else Failed "marker produced empty or near-empty output"
    match outcome with
    | Some (0, _, _) when File.Exists(outputMdPath) ->
        validate outputMdPath
    | Some (0, _, _) ->
        // The CLI may write into a sub-directory named after the PDF instead.
        let mdFiles =
            if Directory.Exists(expectedOutputDir) then
                Directory.GetFiles(expectedOutputDir, "*.md")
            else
                Directory.GetFiles(outputDir, $"{baseName}*.md")
        mdFiles
        |> Array.tryHead
        |> Option.map validate
        |> Option.defaultValue (Failed $"marker ran but no .md file found in {outputDir} or {expectedOutputDir}")
    | Some (code, _, stderr) ->
        Failed $"marker exited with code {code}: {stderr.Trim()}"
    | None ->
        Failed "failed to start marker process"
// -----------------------------------------------------------------
// pdfplumber conversion (via WSL pdf2md.py)
// -----------------------------------------------------------------
/// Convert PDF using pdfplumber via WSL (pdf2md.py script).
///
/// Runs `<venv python> <pdf2md.py> <pdf> <md>` inside WSL with all paths translated
/// by `toWslPath`. Every path argument is quoted, including the script path, so
/// locations containing spaces work. Returns `Success mdPath` when the output
/// file exists and has more than 50 trimmed characters, otherwise `Failed reason`.
let convertWithPdfPlumber (pdfPath: string) (outputDir: string) (pdf2mdScriptPath: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let mdPath = Path.Combine(Path.GetFullPath(outputDir), baseName + ".md")
    let wslPdfPath = toWslPath pdfFull
    let wslMdPath = toWslPath mdPath
    let wslScriptPath = toWslPath pdf2mdScriptPath
    // NOTE(review): interpreter path is specific to one user's WSL home directory.
    let pythonPath = "/home/thorium/.spectrum/venv/bin/python3"
    let args = $"-- {pythonPath} \"{wslScriptPath}\" \"{wslPdfPath}\" \"{wslMdPath}\""
    // MSYS_NO_PATHCONV stops Git Bash from rewriting the POSIX-style arguments.
    let wslEnv = [("MSYS_NO_PATHCONV", "1")]
    Console.info $" Converting with pdfplumber: {baseName}"
    match runProcessWithEnv "wsl" args None wslEnv with
    | Some (0, _, _) ->
        if File.Exists(mdPath) then
            let content = File.ReadAllText(mdPath)
            if content.Trim().Length > 50 then Success mdPath
            else Failed "pdfplumber produced empty or near-empty output"
        else
            Failed $"pdfplumber ran but output file not found: {mdPath}"
    | Some (code, stdout, stderr) ->
        // Prefer stderr; fall back to stdout when the script reported there.
        let errMsg = if String.IsNullOrWhiteSpace(stderr) then stdout.Trim() else stderr.Trim()
        Failed $"pdfplumber exited with code {code}: {errMsg}"
    | None ->
        Failed "failed to start WSL/pdfplumber process"
// -----------------------------------------------------------------
// pdftotext + heuristic formatting fallback
// -----------------------------------------------------------------
open System.Text.RegularExpressions
/// Replace Unicode ligature glyphs left in extracted PDF text with their ASCII
/// letters: U+FB00 (ff), U+FB01 (fi), U+FB02 (fl), U+FB03 (ffi), U+FB04 (ffl).
///
/// A single space directly after a glyph is treated as an extraction artifact
/// and removed ("classi<U+FB01> cation" -> "classification"); a glyph followed
/// by anything else is simply expanded. Plain ASCII text is never modified, so
/// ordinary words ending in "ff"/"fi"/"fl" keep the space that follows them.
/// NOTE(review): a genuine word-final ligature followed by a space is also
/// joined to the next word; this follows the intent stated in the original comments.
let private fixLigatures (text: string) =
    // Glyphs are written as escapes so the source file stays pure ASCII and
    // survives editors/tools that normalise ligatures.
    let ligatures = [
        "\uFB03", "ffi"
        "\uFB04", "ffl"
        "\uFB00", "ff"
        "\uFB01", "fi"
        "\uFB02", "fl"
    ]
    ligatures
    |> List.fold (fun (acc: string) (glyph: string, ascii: string) ->
        acc.Replace(glyph + " ", ascii).Replace(glyph, ascii)) text
/// More targeted ligature repair using regex for common broken words.
///
/// Two families of patterns (all case-insensitive, replacement is lower-case):
///   * mid-word gaps ("classi cation", "e ective"): the gap is removed;
///   * words whose leading "fi"/"fl" was lost (" lter", " rst"): the missing
///     letters are restored and the whitespace separating the word from the
///     previous one is kept (captured as group 1), so "the lter" becomes
///     "the filter" rather than "thefilter".
/// NOTE(review): the very short patterns (" t" -> "fit", " nd" -> "find",
/// " le" -> "file", " ve" -> "five") also match legitimate stand-alone tokens;
/// they are kept as in the original list — confirm they are wanted.
let private fixCommonLigatureWords (text: string) =
    // These are the most common ligature-broken words in academic PDFs
    let fixes = [
        (@"\bclassi\s+cation", "classification")
        (@"\bspeci\s+c\b", "specific")
        (@"\bsigni\s+cant", "significant")
        (@"\barti\s+cial", "artificial")
        (@"\bscienti\s+c\b", "scientific")
        (@"\bdi\s+erent", "different")
        (@"\bdi\s+erence", "difference")
        (@"\bdi\s+cult", "difficult")
        (@"\be\s+ective", "effective")
        (@"\be\s+ect\b", "effect")
        (@"\be\s+ort\b", "effort")
        (@"\be\s+cien", "efficien")
        (@"\bo\s+er\b", "offer")
        (@"\bsu\s+cien", "sufficien")
        (@"\bsu\s+er\b", "suffer")
        (@"\bco\s+ee\b", "coffee")
        // Leading-ligature words: ${1} re-inserts the whitespace that preceded the word.
        (@"\b(\s+)lter", "${1}filter")
        (@"\b(\s+)rst\b", "${1}first")
        (@"\b(\s+)nd\b", "${1}find")
        (@"\b(\s+)eld", "${1}field")
        (@"\b(\s+)le\b", "${1}file")
        (@"\b(\s+)nite", "${1}finite")
        (@"\b(\s+)gure", "${1}figure")
        (@"\b(\s+)ve\b", "${1}five")
        (@"\b(\s+)xed\b", "${1}fixed")
        (@"\b(\s+)t\b", "${1}fit")
        (@"\bpro\s+le\b", "profile")
        (@"\bin\s+uence", "influence")
        (@"\bre\s+ect", "reflect")
        (@"\buore", "fluore")
    ]
    fixes |> List.fold (fun (acc: string) (pattern: string, replacement: string) ->
        Regex.Replace(acc, pattern, replacement, RegexOptions.IgnoreCase)
    ) text
/// Heuristic heading detector. Returns the Markdown heading level for `line`,
/// or None when the line does not look like a heading.
///
///   * "Chapter N:" / "Chapter N -"            -> level 1 (only when standalone)
///   * "2.3.1 Something"                        -> level 3
///   * "2.1 Introduction"                       -> level 2
///   * short ALL-CAPS line (5..60 chars)        -> level 2
///   * well-known section name (< 80 chars)     -> level 2 (only when standalone)
/// "Standalone" means the previous line is absent or blank.
let private isHeading (line: string) (prevLine: string option) =
    let text = line.Trim()
    let looksLike (pattern: string) = Regex.IsMatch(text, pattern)
    // Accept `level` only if the previous line does not carry text.
    let whenStandalone (level: int) =
        match prevLine with
        | Some prev when not (String.IsNullOrWhiteSpace(prev)) -> None
        | _ -> Some level
    match text with
    | "" -> None
    | _ when looksLike @"^Chapter\s+\d+\s*[\-–—:]" -> whenStandalone 1
    // Numbered sub-sub-sections: "2.3.1 Something"
    | _ when looksLike @"^\d+\.\d+\.\d+\s+[A-Z]" -> Some 3
    // Numbered sections: "2.1 Introduction", "14.10 The Google PageRank"
    | _ when looksLike @"^\d+\.\d+\s+[A-Z]" -> Some 2
    // ALL-CAPS short lines (often headings, but not single words like "SPAM")
    | _ when text.Length >= 5
             && text.Length <= 60
             && text = text.ToUpper()
             && looksLike @"^[A-Z][A-Z\s\-:]+[A-Z]$" -> Some 2
    // Well-known section names, only when standalone (previous blank)
    | _ when looksLike @"^(Preface|Appendix|Bibliography|Bibliographic Notes|Exercises|References|Index|Glossary|Acknowledgments|Dedication|Foreword|Conclusion|Summary)\b"
             && text.Length < 80 -> whenStandalone 2
    | _ -> None
/// Heuristic: detect if a line is likely a page number or running
/// header/footer artifact rather than document content.
/// Returns true for:
///   * a standalone page number (1-4 digits alone on the line);
///   * a page header such as "42  2. Overview of Supervised Learning"
///     (digits, at least two whitespace characters, digits, a dot, whitespace);
///   * a lower-case roman numeral alone on the line.
let private isArtifact (line: string) =
    let trimmed = line.Trim()
    // Standalone page number (1-4 digits alone on a line)
    Regex.IsMatch(trimmed, @"^\d{1,4}$")
    // Page header pattern like "42  2. Overview of Supervised Learning"
    || Regex.IsMatch(trimmed, @"^\d+\s{2,}\d+\.\s")
    // Roman numeral page numbers. Only well-formed numerals are accepted: a
    // plain "[ivxlcdm]+" test also matches ordinary words such as "mild",
    // "civil" or "did", which would then be dropped from the output.
    // The lookahead rejects the empty string, which the rest would accept.
    || Regex.IsMatch(trimmed, @"^(?=[ivxlcdm])m{0,3}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})$")
/// True when the line opens with a figure or table caption label:
/// "FIGURE", "TABLE", "Figure" or "Table", whitespace, a number, then "." or ":".
let private isFigureOrTable (line: string) =
    let captionStart = Regex(@"^(FIGURE|TABLE|Figure|Table)\s+\d+[\.\:]")
    line.Trim() |> captionStart.IsMatch
/// True when the line starts like a list item: a bullet glyph (●, •, ◦, ▪),
/// a "-" or "*" followed by whitespace and text, or a lower-case letter
/// followed by a dot, whitespace and text ("a. something").
let private isBullet (line: string) =
    let candidate = line.Trim()
    let startsWithGlyph =
        [ "●"; "•"; "◦"; "▪" ]
        |> List.exists (fun (glyph: string) -> candidate.StartsWith(glyph))
    if startsWithGlyph then
        true
    elif Regex.IsMatch(candidate, @"^[\-\*]\s+\S") then
        // Dash or asterisk marker
        true
    else
        // Lettered list items: "a. something", "b. something"
        Regex.IsMatch(candidate, @"^[a-z]\.\s+\S")
/// True when the whole line is an equation number of the form "(2.7)":
/// digits, a dot and digits in parentheses, with nothing else on the line.
let private isEquationRef (line: string) =
    let equationNumber = Regex(@"^\(\d+\.\d+\)$")
    line.Trim() |> equationNumber.IsMatch
/// Ends whatever is currently at the tail of the buffer so that the next block
/// starts after a blank line.
///   lineOpen   - the buffer ends with paragraph text that has no line
///                terminator yet
///   afterBlank - the buffer already ends with a blank line
let private ensureBlankLineBefore (sb: StringBuilder) (lineOpen: bool) (afterBlank: bool) =
    if lineOpen then sb.AppendLine() |> ignore
    if not afterBlank then sb.AppendLine() |> ignore

/// Convert raw pdftotext output to structured markdown.
///
/// rawText        - text as produced by pdftotext; split on '\n', each line trimmed
/// sourceFileName - file name (or path) of the source PDF; its name without
///                  extension becomes the top-level "# " title
///
/// Per line: blank lines end the current paragraph, artifact lines (page
/// numbers, running headers) are skipped, equation references and
/// figure/table captions become standalone lines, bullets become "- " items,
/// detected headings become "#"-prefixed lines, and everything else is joined
/// into the current paragraph with single spaces. The finished text is passed
/// through fixCommonLigatureWords.
let private textToMarkdown (rawText: string) (sourceFileName: string) =
    let lines = rawText.Split([| '\n' |])
    let sb = StringBuilder()
    let title = Path.GetFileNameWithoutExtension(sourceFileName)
    sb.AppendLine($"# {title}").AppendLine() |> ignore

    // prevWasEmpty: the output currently ends with a blank line.
    // inBlock: paragraph text has been appended WITHOUT a line terminator, so
    //          anything that is not a continuation must terminate it first.
    let mutable prevWasEmpty = true
    let mutable inBlock = false
    let mutable prevRawLine : string option = None

    for line in lines do
        let trimmed = line.Trim()
        if String.IsNullOrWhiteSpace(trimmed) then
            // Close the paragraph AND leave a blank line; a bare newline would
            // make Markdown merge this paragraph with the next one.
            ensureBlankLineBefore sb inBlock prevWasEmpty
            prevWasEmpty <- true
            inBlock <- false
        elif isArtifact trimmed then
            () // skip page numbers and headers
        elif isEquationRef trimmed then
            ensureBlankLineBefore sb inBlock prevWasEmpty
            sb.AppendLine(trimmed).AppendLine() |> ignore
            prevWasEmpty <- true
            inBlock <- false
        elif isFigureOrTable trimmed then
            ensureBlankLineBefore sb inBlock prevWasEmpty
            sb.AppendLine($"***{trimmed}***").AppendLine() |> ignore
            prevWasEmpty <- true
            inBlock <- false
        elif isBullet trimmed then
            let content =
                // Lettered items keep their "a." label; glyph, dash and
                // asterisk markers are one character long and are dropped.
                if Regex.IsMatch(trimmed, @"^[a-z]\.\s+") then
                    trimmed
                else
                    trimmed.Substring(1).Trim()
            // Terminate an open paragraph first, otherwise the item is glued
            // onto its last word ("text:- item").
            if inBlock then sb.AppendLine() |> ignore
            sb.AppendLine($"- {content}") |> ignore
            prevWasEmpty <- false
            inBlock <- false
        else
            match isHeading trimmed prevRawLine with
            | Some level ->
                ensureBlankLineBefore sb inBlock prevWasEmpty
                // The document title already uses "#", so headings start at "##".
                let prefix = String('#', level + 1)
                sb.AppendLine($"{prefix} {trimmed}").AppendLine() |> ignore
                prevWasEmpty <- true
                inBlock <- false
            | None ->
                // Join wrapped lines of one paragraph with a single space.
                if inBlock then
                    sb.Append(' ').Append(trimmed) |> ignore
                else
                    sb.Append(trimmed) |> ignore
                prevWasEmpty <- false
                inBlock <- true
        // Recorded for every input line (blank ones included) so isHeading can
        // tell whether a candidate heading follows a blank line.
        prevRawLine <- Some trimmed

    // Terminate a paragraph that runs to the end of the input.
    if inBlock then sb.AppendLine() |> ignore
    sb.ToString() |> fixCommonLigatureWords
/// Convert PDF using pdftotext + heuristic markdown formatting.
///
/// pdfPath      - path of the PDF to convert
/// outputDir    - directory for the result; created if it does not exist
/// pdftotextCmd - detected pdftotext command; when it starts with "wsl" the
///                tool is run through WSL with a translated path
///
/// Returns `Success mdPath` with the path of the written "<name>.md" file, or
/// `Failed reason` when pdftotext could not be started, exited with a non-zero
/// code, or produced 50 characters or fewer of text.
let convertWithPdfToText (pdfPath: string) (outputDir: string) (pdftotextCmd: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let mdPath = Path.Combine(outputDir, baseName + ".md")
    let cmd, args =
        // Ordinal: this is a command prefix, not natural-language text.
        if pdftotextCmd.StartsWith("wsl", StringComparison.Ordinal) then
            "wsl", $"pdftotext -layout \"{toWslPath pdfFull}\" -"
        else
            "pdftotext", $"-layout \"{pdfFull}\" -"
    Console.info $" Converting with pdftotext (fallback): {baseName}"
    match runProcess cmd args None with
    | Some (0, stdout, _) ->
        if stdout.Trim().Length > 50 then
            // Pass the original path: textToMarkdown strips the extension
            // itself, so handing it the already-stripped base name would cut
            // "report.final" down to "report" in the document title.
            let markdown = textToMarkdown stdout pdfPath
            File.WriteAllText(mdPath, markdown)
            Success mdPath
        else
            Failed "pdftotext produced empty or near-empty output (scanned PDF without OCR?)"
    | Some (code, _, stderr) ->
        Failed $"pdftotext exited with code {code}: {stderr.Trim()}"
    | None ->
        Failed "failed to start pdftotext process"
// -----------------------------------------------------------------
// Fallback chain: marker -> pdfplumber -> pdftotext+heuristics
// -----------------------------------------------------------------
/// Detected tool configuration (avoids re-detecting per PDF in batch mode)
/// Each field holds the result of the matching availability probe; `None`
/// means that converter is skipped.
type ToolConfig = {
// Result of isMarkerAvailable (); when present it is handed to convertWithMarker.
Marker: string option
// Result of isPdfPlumberAvailable (); when present it is handed to
// convertWithPdfPlumber. convertBatch shows only its file name, so this is
// presumably a script path - TODO confirm against isPdfPlumberAvailable.
PdfPlumber: string option
// Result of isPdfToTextAvailable (); when present it is handed to
// convertWithPdfToText, which checks whether it starts with "wsl".
PdfToText: string option
}
/// Probes for each supported converter (marker, pdfplumber, pdftotext, in that
/// order) and bundles the results into a ToolConfig.
let detectTools () : ToolConfig =
    let marker = isMarkerAvailable ()
    let pdfPlumber = isPdfPlumberAvailable ()
    let pdfToText = isPdfToTextAvailable ()
    { Marker = marker
      PdfPlumber = pdfPlumber
      PdfToText = pdfToText }
/// Converts one PDF to markdown, trying the detected tools in priority order
/// (marker-pdf, then pdfplumber, then pdftotext) until one succeeds.
///
/// tools     - previously detected tool configuration
/// pdfPath   - PDF to convert; must exist
/// outputDir - directory handed to the individual converters
///
/// Returns the first `Success`, or `Failed` when the file is missing, no tool
/// is available, or the last tool in the chain fails (its reason is reported).
let convertPdfWith (tools: ToolConfig) (pdfPath: string) (outputDir: string) : ConvertResult =
    if not (File.Exists(pdfPath)) then
        Failed $"PDF file not found: {pdfPath}"
    else
        // Available tools only, each paired with a deferred conversion so that
        // later tools run only after earlier ones have failed.
        let candidates =
            [ match tools.Marker with
              | Some cmd -> yield "marker-pdf", (fun () -> convertWithMarker pdfPath outputDir cmd)
              | None -> ()
              match tools.PdfPlumber with
              | Some scr -> yield "pdfplumber", (fun () -> convertWithPdfPlumber pdfPath outputDir scr)
              | None -> ()
              match tools.PdfToText with
              | Some cmd -> yield "pdftotext", (fun () -> convertWithPdfToText pdfPath outputDir cmd)
              | None -> () ]

        let rec attempt pending =
            match pending with
            | [] -> Failed "All conversion tools failed"
            | (name, run) :: rest ->
                match run (), rest with
                | Success path, _ ->
                    Success path
                | Failed reason, [] ->
                    // Nothing left to fall back to: report this tool's reason.
                    Failed $"{name} failed: {reason}"
                | Failed reason, (nextName, _) :: _ ->
                    Console.warn $" {name} failed ({reason}), trying {nextName}..."
                    attempt rest

        match candidates with
        | [] -> Failed "No conversion tools found. Install marker-pdf, pdfplumber (WSL), or pdftotext (poppler-utils)."
        | _ -> attempt candidates
/// Converts one PDF, running tool detection first.
/// For several files prefer convertPdfWith with a shared ToolConfig so that
/// detection happens only once.
let convertPdf (pdfPath: string) (outputDir: string) =
    let tools = detectTools ()
    convertPdfWith tools pdfPath outputDir
/// Converts a list of PDFs to markdown and reports progress on the console.
///
/// pdfPaths  - PDFs to convert, processed in list order
/// outputDir - directory handed to the converters
///
/// Tools are detected once for the whole batch. Returns one
/// (pdfPath, result) pair per input, or an empty list when no conversion
/// tool is available.
let convertBatch (pdfPaths: string list) (outputDir: string) =
    printfn ""
    let total = List.length pdfPaths
    Console.info $"Converting {total} PDF(s) to Markdown..."

    // Detect tools once for the entire batch
    let tools = detectTools ()

    // Report what was found ...
    match tools.Marker with
    | Some cmd -> Console.info $" Primary tool: marker-pdf ({cmd})"
    | None -> ()
    match tools.PdfPlumber with
    | Some path -> Console.info $" Mid-tier tool: pdfplumber via WSL ({Path.GetFileName(path)})"
    | None -> ()
    match tools.PdfToText with
    | Some cmd -> Console.info $" Fallback tool: pdftotext ({cmd})"
    | None -> ()

    // ... and then what is missing
    if Option.isNone tools.Marker then
        Console.warn " marker-pdf not found (install with: pip install marker-pdf)"
    if Option.isNone tools.PdfPlumber then
        Console.warn " pdfplumber not found (requires WSL + pdf2md.py + pdfplumber venv)"
    if Option.isNone tools.PdfToText then
        Console.warn " pdftotext not found (install poppler-utils)"

    match tools.Marker, tools.PdfPlumber, tools.PdfToText with
    | None, None, None ->
        Console.error "No PDF conversion tools available!"
        Console.info "Install one of:"
        Console.info " pip install marker-pdf (recommended, ML-based, high quality)"
        Console.info " WSL + pdfplumber (mid-tier, font-size-aware headings)"
        Console.info " apt install poppler-utils (pdftotext, lightweight fallback)"
        []
    | _ ->
        printfn ""
        let convertOne (index: int) (pdfPath: string) =
            let name = Path.GetFileName(pdfPath)
            printfn $" [{index + 1}/{total}] {name}"
            let outcome = convertPdfWith tools pdfPath outputDir
            match outcome with
            | Success outPath -> Console.success $" -> {outPath}"
            | Failed reason -> Console.error $" FAILED: {reason}"
            pdfPath, outcome
        let results = List.mapi convertOne pdfPaths

        let succeeded, failed =
            results
            |> List.partition (fun (_, outcome) ->
                match outcome with
                | Success _ -> true
                | _ -> false)
        let successCount = List.length succeeded
        let failCount = List.length failed
        printfn ""
        Console.success $"Conversion complete: {successCount} succeeded, {failCount} failed"
        results
// =============================================================================
// Convert command
// =============================================================================
module Commands2 =
    /// Handles the "convert" command.
    ///
    /// pathOrPattern - an existing file (converted as-is), an existing
    ///                 directory (searched recursively for "*.pdf"), or
    ///                 otherwise a "directory/pattern" file-name pattern
    /// outputDir     - destination directory; defaults to "."
    ///
    /// Prints an error when nothing matches; otherwise runs the batch
    /// conversion and discards its result list.
    let cmdConvert (pathOrPattern: string) (outputDir: string option) =
        let outDir = outputDir |> Option.defaultValue "."

        // Treat the argument as "<dir>/<pattern>" and keep only *.pdf matches.
        let filesMatchingPattern () =
            let dir = Path.GetDirectoryName(pathOrPattern)
            let pattern = Path.GetFileName(pathOrPattern)
            let searchDir = if String.IsNullOrEmpty(dir) then "." else dir
            if not (Directory.Exists(searchDir)) then
                []
            else
                Directory.GetFiles(searchDir, pattern)
                |> Array.filter (fun f -> f.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
                |> Array.toList

        // Collect PDF files to convert
        let pdfFiles =
            if File.Exists(pathOrPattern) then
                // Single file
                [ Path.GetFullPath(pathOrPattern) ]
            elif Directory.Exists(pathOrPattern) then
                // Directory: every PDF below it
                let root = Path.GetFullPath(pathOrPattern)
                Directory.GetFiles(root, "*.pdf", SearchOption.AllDirectories)
                |> Array.toList
            else
                filesMatchingPattern ()

        match pdfFiles with
        | [] ->
            Console.error (sprintf "No PDF files found matching: %s" pathOrPattern)
        | files ->
            Pdf2Md.convertBatch files outDir |> ignore
// =============================================================================
// CLI entry point
// =============================================================================
/// Prints the command-line help: available commands, examples, the supported
/// conversion tools and the first-time registration steps.
let showUsage () =
    let helpLines =
        [ ""
          "reMarkable Cloud File Downloader & PDF Converter"
          "================================================"
          ""
          "Usage:"
          " dotnet fsi remarkable-download.fsx register <code> Register device"
          " dotnet fsi remarkable-download.fsx list [/path] List folders and files"
          " dotnet fsi remarkable-download.fsx download <path|id> [output-dir]"
          " Download file or folder"
          " dotnet fsi remarkable-download.fsx convert <pdf|dir> [output-dir]"
          " Convert PDF(s) to Markdown"
          ""
          "Examples:"
          " dotnet fsi remarkable-download.fsx register abcdefgh"
          " dotnet fsi remarkable-download.fsx list"
          " dotnet fsi remarkable-download.fsx list /MyNotebooks"
          " dotnet fsi remarkable-download.fsx download /MyNotebooks/Paper.pdf"
          " dotnet fsi remarkable-download.fsx download /MyNotebooks ./downloads"
          " dotnet fsi remarkable-download.fsx download 12345678-abcd-efgh-ijkl-123456789abc"
          " dotnet fsi remarkable-download.fsx convert mybook.pdf"
          " dotnet fsi remarkable-download.fsx convert mybook.pdf ./markdown-output"
          " dotnet fsi remarkable-download.fsx convert ./pdfs-folder ./markdown-output"
          ""
          "Conversion tools (install at least one):"
          " pip install marker-pdf ML-based, best quality (recommended)"
          " WSL + pdfplumber + pdf2md.py Mid-tier, font-size-aware headings"
          " pdftotext (poppler-utils) Lightweight fallback, heuristic formatting"
          ""
          "First-time setup:"
          " 1. Go to https://my.remarkable.com/device/connect/desktop"
          " 2. Get a one-time code"
          " 3. Run: dotnet fsi remarkable-download.fsx register <code>"
          "" ]
    // One output line per entry, in order.
    helpLines |> List.iter (fun text -> printfn "%s" text)
/// Script entry point: dispatches on the first user argument.
///
/// args - the raw argument array, as delivered by fsi.CommandLineArgs (script
///        name first, then the user's arguments)
///
/// Leading entries that end in ".fsx" or start with "--" are skipped. What
/// remains selects register / list / download / convert; anything else prints
/// the usage text. The asynchronous commands are awaited synchronously.
let main (args: string[]) =
    // Drop the script file name and any leading flags.
    let isScriptOrFlag (arg: string) = arg.EndsWith(".fsx") || arg.StartsWith("--")
    let userArgs =
        args
        |> Seq.skipWhile isScriptOrFlag
        |> List.ofSeq

    match userArgs with
    | "register" :: code :: _ ->
        (Commands.cmdRegister code).GetAwaiter().GetResult()
    | "list" :: rest ->
        // Optional path: first extra argument, if any.
        (Commands.cmdList (List.tryHead rest)).GetAwaiter().GetResult()
    | "download" :: pathOrId :: rest ->
        // Optional output directory: first extra argument, if any.
        (Commands.cmdDownload pathOrId (List.tryHead rest)).GetAwaiter().GetResult()
    | "convert" :: pathOrPattern :: rest ->
        Commands2.cmdConvert pathOrPattern (List.tryHead rest)
    | _ ->
        showUsage ()

main fsi.CommandLineArgs
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment