Created
February 10, 2026 18:17
-
-
Save Thorium/a8bbd9f772adfa9798060d84c96bc5da to your computer and use it in GitHub Desktop.
1) Script for downloading reMarkable PDFs from their cloud service, 2) Script to convert PDFs to Markdown (pdf2md) to be consumed e.g. by agents.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env dotnet fsi | |
| // ============================================================================= | |
| // reMarkable Cloud File Downloader and PDF to Markdown (pdf2md) - F# Script | |
| // Based on the PHP ReMarkableAPI by splitbrain/remarkable-api | |
| // This is to download reMarkable cloud files and convert them e.g. for agents to read. | |
| // You'll need to add/use reMarkable device web-script (to get auth token). | |
| // ============================================================================= | |
| // | |
| // Usage: | |
| // dotnet fsi remarkable-download.fsx register <one-time-code> | |
| // dotnet fsi remarkable-download.fsx list [/path] | |
| // dotnet fsi remarkable-download.fsx download <path-or-id> [output-dir] | |
| // | |
| // The script stores the auth token in 'auth.token' in the current directory. | |
| // ============================================================================= | |
| open System | |
| open System.IO | |
| open System.Net.Http | |
| open System.Net.Http.Headers | |
| open System.Text | |
| open System.Text.Json | |
| open System.Threading.Tasks | |
| // ============================================================================= | |
| // Constants | |
| // ============================================================================= | |
/// Base URL of the authentication service (device registration and token refresh).
[<Literal>]
let AuthApi = "https://webapp-prod.cloud.remarkable.engineering"

/// Base URL of the service-discovery endpoint queried for the storage host.
[<Literal>]
let ServiceDiscoveryApi = "https://service-manager-production-dot-remarkable-production.appspot.com"

/// Storage API base URL used until service discovery supplies another host.
/// Marked as a literal like the other constants so it can be used in patterns
/// and attributes as well.
[<Literal>]
let DefaultStorageApi = "https://document-storage-production-dot-remarkable-production.appspot.com"

/// File (relative to the current directory) in which the auth token is stored.
[<Literal>]
let TokenFile = "auth.token"

/// Value of an item's `Type` field that marks a folder.
[<Literal>]
let TypeCollection = "CollectionType"

/// Value of an item's `Type` field that marks a document.
[<Literal>]
let TypeDocument = "DocumentType"
| // ============================================================================= | |
| // Data types | |
| // ============================================================================= | |
/// One entry of the document list: a folder or a document.
/// Every field except `Path` is filled from the JSON property of the same
/// name (including the "VissibleName" spelling, which is the key that is read).
type RemarkableItem =
    {
        ID: string
        Version: int
        Message: string
        Success: bool
        BlobURLGet: string
        BlobURLGetExpires: string
        ModifiedClient: string
        Type: string
        VissibleName: string
        Parent: string
        CurrentPage: int
        Bookmarked: bool
        /// Computed client-side; not part of the JSON payload.
        Path: string
    }
/// Shape of a service-discovery reply: a status string and a host name.
type ServiceDiscoveryResponse =
    {
        Status: string
        Host: string
    }
| // ============================================================================= | |
| // JSON helpers | |
| // ============================================================================= | |
/// Shared serializer options: property names are matched case-insensitively.
let jsonOptions =
    JsonSerializerOptions(PropertyNameCaseInsensitive = true)
/// Convert one JSON object into a `RemarkableItem`.
///
/// Missing properties, and properties whose JSON kind does not match the
/// expected one, fall back to a default ("" / 0 / false) instead of failing.
/// `Path` is always left empty here; it is computed later on the client side.
let parseItem (elem: JsonElement) : RemarkableItem =
    /// The property value if it exists and has the requested JSON kind.
    let propOfKind (name: string) (kind: JsonValueKind) =
        match elem.TryGetProperty(name) with
        | true, v when v.ValueKind = kind -> Some v
        | _ -> None

    let str (name: string) =
        match propOfKind name JsonValueKind.String with
        | Some v -> v.GetString()
        | None -> ""

    let int' (name: string) =
        match propOfKind name JsonValueKind.Number with
        | Some v ->
            // TryGetInt32 rather than GetInt32: a fractional or out-of-range
            // number falls back to 0 instead of raising FormatException.
            match v.TryGetInt32() with
            | true, n -> n
            | _ -> 0
        | None -> 0

    // Only a JSON `true` yields true; `false`, a missing property, or any
    // other kind all yield false.
    let bool' (name: string) =
        (propOfKind name JsonValueKind.True).IsSome

    {
        ID = str "ID"
        Version = int' "Version"
        Message = str "Message"
        Success = bool' "Success"
        BlobURLGet = str "BlobURLGet"
        BlobURLGetExpires = str "BlobURLGetExpires"
        ModifiedClient = str "ModifiedClient"
        Type = str "Type"
        VissibleName = str "VissibleName"
        Parent = str "Parent"
        CurrentPage = int' "CurrentPage"
        Bookmarked = bool' "Bookmarked"
        Path = ""
    }
/// Parse a JSON array of item objects into a list of `RemarkableItem`.
/// Raises if `json` is not valid JSON or its root is not an array.
let parseItems (json: string) : RemarkableItem list =
    // JsonDocument rents pooled buffers and must be disposed. The list below
    // is built eagerly, so no element is read after disposal.
    use doc = JsonDocument.Parse(json)
    [ for elem in doc.RootElement.EnumerateArray() -> parseItem elem ]
| // ============================================================================= | |
| // Console helpers | |
| // ============================================================================= | |
/// Coloured console output helpers.
module Console =

    /// Run `f` with the console foreground colour set to `c` and return its
    /// result. The previous colour is restored even when `f` throws.
    let color c (f: unit -> 'a) =
        // Fully qualified so the reference cannot be confused with this module.
        let previous = System.Console.ForegroundColor
        System.Console.ForegroundColor <- c
        try
            f ()
        finally
            System.Console.ForegroundColor <- previous

    /// Write "ERROR: <msg>" to stderr in red.
    let error msg =
        color ConsoleColor.Red (fun () -> eprintfn "ERROR: %s" msg)

    /// Write "WARNING: <msg>" to stderr in yellow.
    let warn msg =
        color ConsoleColor.Yellow (fun () -> eprintfn "WARNING: %s" msg)

    /// Write `msg` to stdout in cyan.
    let info msg =
        color ConsoleColor.Cyan (fun () -> printfn "%s" msg)

    /// Write `msg` to stdout in green.
    let success msg =
        color ConsoleColor.Green (fun () -> printfn "%s" msg)
| // ============================================================================= | |
| // Progress bar | |
| // ============================================================================= | |
/// Single-line textual progress bar written to stderr.
module ProgressBar =

    let private barWidth = 40

    let private toMB (bytes: int64) = float bytes / (1024.0 * 1024.0)

    /// Redraw the progress line for `label`.
    ///
    /// `current` is the number of bytes transferred so far and `total` the
    /// expected size. When `total` is not positive the size is treated as
    /// unknown: a bar of '?' and only the transferred amount are shown.
    /// The line starts with '\r' and has no newline, so repeated calls
    /// overwrite each other.
    let render (label: string) (current: int64) (total: int64) =
        let text =
            if total > 0L then
                // Clamp to [0, 1]: more bytes may arrive than `total` announced,
                // and an unclamped ratio would make the '-' run length negative,
                // which String(char, int) rejects with an exception.
                let pct = float current / float total |> max 0.0 |> min 1.0
                let filled = int (pct * float barWidth)
                let bar = String('#', filled) + String('-', barWidth - filled)
                sprintf "\r %s [%s] %5.1f%% (%5.1f / %5.1f MB)" label bar (pct * 100.0) (toMB current) (toMB total)
            else
                sprintf "\r %s [%s] %5.1f MB" label (String('?', barWidth)) (toMB current)
        Console.Error.Write(text)

    /// Draw the final, fully filled bar for `label` with the total size and
    /// end the line.
    let complete (label: string) (total: int64) =
        let bar = String('#', barWidth)
        let text = sprintf "\r %s [%s] 100.0%% (%5.1f MB)" label bar (toMB total)
        Console.Error.WriteLine(text)
| // ============================================================================= | |
| // HTTP Client wrapper | |
| // ============================================================================= | |
/// Thin wrapper around `HttpClient` that attaches the current bearer token
/// (when one is set) to every request and remembers the storage API base URL.
/// Dispose the instance to release the underlying `HttpClient`.
type RemarkableClient() =
    let httpClient = new HttpClient()
    let mutable token = ""
    let mutable storageApi = DefaultStorageApi

    /// Build a request for `url`, adding the Authorization header when a token is set.
    /// The caller owns (and disposes) the returned message.
    let createRequest (httpMethod: HttpMethod) (url: string) =
        let msg = new HttpRequestMessage(httpMethod, url)
        if not (String.IsNullOrEmpty(token)) then
            msg.Headers.Authorization <- AuthenticationHeaderValue("Bearer", token)
        msg

    /// Send `msg`, fail on a non-success status, and return the body as text.
    let sendForString (msg: HttpRequestMessage) = task {
        // The response is disposed once the body has been read.
        use! resp = httpClient.SendAsync(msg)
        resp.EnsureSuccessStatusCode() |> ignore
        return! resp.Content.ReadAsStringAsync()
    }

    /// Bearer token sent with each request; empty means "send no Authorization header".
    member _.Token
        with get() = token
        and set(v) = token <- v

    /// Base URL of the document-storage API.
    member _.StorageApi
        with get() = storageApi
        and set(v) = storageApi <- v

    /// GET `url` and return the response body. Raises on a non-success status.
    member _.GetStringAsync(url: string) = task {
        use msg = createRequest HttpMethod.Get url
        return! sendForString msg
    }

    /// POST `body` as JSON to `url` and return the response body.
    /// An empty or null `body` sends the request without content.
    member _.PostStringAsync(url: string, body: string) = task {
        use msg = createRequest HttpMethod.Post url
        if not (String.IsNullOrEmpty(body)) then
            msg.Content <- new StringContent(body, Encoding.UTF8, "application/json")
        return! sendForString msg
    }

    /// PUT `body` as JSON to `url` and return the response body.
    member _.PutJsonAsync(url: string, body: string) = task {
        use msg = createRequest HttpMethod.Put url
        msg.Content <- new StringContent(body, Encoding.UTF8, "application/json")
        return! sendForString msg
    }

    /// Stream the content at `url` into `destPath` (overwriting it), updating
    /// the progress bar under `label` after every chunk. Returns the number
    /// of bytes written.
    member _.DownloadFileAsync(url: string, destPath: string, label: string) = task {
        use msg = createRequest HttpMethod.Get url
        // ResponseHeadersRead: return as soon as headers arrive so the body
        // is streamed instead of being buffered by SendAsync.
        use! resp = httpClient.SendAsync(msg, HttpCompletionOption.ResponseHeadersRead)
        resp.EnsureSuccessStatusCode() |> ignore
        // 0 stands for "length unknown" and is what the progress bar expects.
        let totalBytes =
            let announced = resp.Content.Headers.ContentLength
            if announced.HasValue then announced.Value else 0L
        use! stream = resp.Content.ReadAsStreamAsync()
        use fileStream = new FileStream(destPath, FileMode.Create, FileAccess.Write, FileShare.None, 8192, true)
        let buffer = Array.zeroCreate<byte> 81920
        let mutable totalRead = 0L
        let mutable reading = true
        while reading do
            let! bytesRead = stream.ReadAsync(buffer, 0, buffer.Length)
            if bytesRead = 0 then
                reading <- false
            else
                do! fileStream.WriteAsync(buffer, 0, bytesRead)
                totalRead <- totalRead + int64 bytesRead
                ProgressBar.render label totalRead totalBytes
        ProgressBar.complete label totalRead
        return totalRead
    }

    interface IDisposable with
        member _.Dispose() = httpClient.Dispose()
| // ============================================================================= | |
| // reMarkable API functions | |
| // ============================================================================= | |
/// Calls against the reMarkable cloud endpoints.
module Api =

    /// Strip surrounding whitespace and double quotes from a token response.
    let private unquote (raw: string) = raw.Trim().Trim('"')

    /// Register a new device with a one-time code and return the device token.
    /// A fresh random device ID is generated on every call.
    let register (client: RemarkableClient) (code: string) = task {
        let deviceId = Guid.NewGuid().ToString()
        // Serialize the values instead of splicing them in raw, so a code
        // containing quotes or backslashes still yields valid JSON.
        let body =
            sprintf """{"code":%s,"deviceDesc":"desktop-windows","deviceID":%s}"""
                (JsonSerializer.Serialize<string>(code))
                (JsonSerializer.Serialize<string>(deviceId))
        let url = sprintf "%s/token/json/2/device/new" AuthApi
        let! token = client.PostStringAsync(url, body)
        // The response is a raw JWT token string (may be quoted)
        return unquote token
    }

    /// Exchange the client's current token for a new bearer token, store it
    /// on the client, and return it.
    let refreshToken (client: RemarkableClient) = task {
        let url = sprintf "%s/token/json/2/user/new" AuthApi
        let! response = client.PostStringAsync(url, "")
        let newToken = unquote response
        client.Token <- newToken
        return newToken
    }

    /// Ask the service-discovery endpoint for the storage host and store
    /// "https://<host>" on the client. When the reply's Status is not the
    /// string "OK" a warning is printed and the client keeps its current
    /// storage API; a missing, non-string or blank Host is ignored silently.
    let discoverStorage (client: RemarkableClient) = task {
        let url = sprintf "%s/service/json/1/document-storage?environment=production&group=auth0%%7C5a68dc51cb30df3877a1d7c4&apiVer=2" ServiceDiscoveryApi
        let! json = client.GetStringAsync(url)
        use doc = JsonDocument.Parse(json)
        let root = doc.RootElement
        // Only read string-kind properties: GetString() raises on other kinds.
        let stringProp (name: string) =
            match root.TryGetProperty(name) with
            | true, v when v.ValueKind = JsonValueKind.String -> Some (v.GetString())
            | _ -> None
        match stringProp "Status" with
        | Some "OK" ->
            match stringProp "Host" with
            | Some host when not (String.IsNullOrWhiteSpace(host)) ->
                client.StorageApi <- sprintf "https://%s" host
            | _ -> ()
        | _ ->
            Console.warn "Service discovery did not return OK status, using default storage API"
    }

    /// Initialize the client: install `savedToken`, refresh it, then discover
    /// the storage host. Returns the refreshed token.
    let init (client: RemarkableClient) (savedToken: string) = task {
        client.Token <- savedToken
        let! newToken = refreshToken client
        do! discoverStorage client
        return newToken
    }

    /// List all items (flat list from API).
    let listItems (client: RemarkableClient) = task {
        let url = sprintf "%s/document-storage/json/2/docs" client.StorageApi
        let! json = client.GetStringAsync(url)
        return parseItems json
    }

    /// Get a single item by ID, optionally asking for its blob download URL.
    /// Returns the first item of the reply, or None when the reply is empty.
    let getItem (client: RemarkableClient) (id: string) (withBlob: bool) = task {
        let blobParam = if withBlob then "&withBlob=true" else ""
        // Escape the id so characters with a meaning in a query string cannot
        // alter the request.
        let url =
            sprintf "%s/document-storage/json/2/docs?doc=%s%s" client.StorageApi (Uri.EscapeDataString(id)) blobParam
        let! json = client.GetStringAsync(url)
        return parseItems json |> List.tryHead
    }
| // ============================================================================= | |
| // Filesystem tree (reconstructing hierarchy from flat items) | |
| // ============================================================================= | |
/// Reconstructs the folder hierarchy from the flat item list.
module FS =

    /// All items of the index, in key (ID) order.
    let private allItems (indexWithPaths: Map<string, RemarkableItem>) : RemarkableItem list =
        indexWithPaths |> Map.toList |> List.map snd

    /// Compute `Path` for every item and return
    /// (items by ID, items grouped by their own path).
    ///
    /// An item with an empty `Parent` gets "/<name>"; otherwise its path is
    /// the parent's path plus "/<name>". Items whose parent chain leads to an
    /// unknown ID, or loops back on itself, get no path and are left out of
    /// both results.
    let buildTree (items: RemarkableItem list) : Map<string, RemarkableItem> * Map<string, RemarkableItem list> =
        let index = items |> List.map (fun i -> i.ID, i) |> Map.ofList

        // `visited` holds the IDs already on the current chain; meeting one
        // again means the Parent links form a cycle, which would otherwise
        // recurse without end.
        let rec calcPath (visited: Set<string>) (item: RemarkableItem) : string option =
            if String.IsNullOrEmpty(item.Parent) then
                Some (sprintf "/%s" item.VissibleName)
            elif Set.contains item.ID visited then
                None
            else
                Map.tryFind item.Parent index // None here: orphan item
                |> Option.bind (calcPath (Set.add item.ID visited))
                |> Option.map (fun parentPath -> sprintf "%s/%s" parentPath item.VissibleName)

        let itemsWithPaths =
            items
            |> List.choose (fun item ->
                calcPath Set.empty item
                |> Option.map (fun path -> { item with Path = path }))

        let indexWithPaths =
            itemsWithPaths |> List.map (fun i -> i.ID, i) |> Map.ofList

        let tree =
            itemsWithPaths
            |> List.groupBy (fun i -> i.Path)
            |> Map.ofList

        (indexWithPaths, tree)

    /// Items whose path equals `path` (ignoring a trailing slash) or lies
    /// below it. Comparison ignores case.
    let findUnderPath (indexWithPaths: Map<string, RemarkableItem>) (path: string) : RemarkableItem list =
        let prefix = if path.EndsWith("/") then path else path + "/"
        let exact = path.TrimEnd('/')
        allItems indexWithPaths
        |> List.filter (fun item ->
            item.Path.StartsWith(prefix, StringComparison.OrdinalIgnoreCase)
            || item.Path.Equals(exact, StringComparison.OrdinalIgnoreCase))

    /// First item (in ID order) whose path equals `path`, ignoring case.
    /// When `itemType` is given the item's `Type` must match it exactly.
    let findByPath (indexWithPaths: Map<string, RemarkableItem>) (path: string) (itemType: string option) : RemarkableItem option =
        let typeMatches (item: RemarkableItem) =
            match itemType with
            | Some t -> item.Type = t
            | None -> true
        allItems indexWithPaths
        |> List.tryFind (fun item ->
            item.Path.Equals(path, StringComparison.OrdinalIgnoreCase) && typeMatches item)

    /// Direct children of `parentPath` (one level deep); an empty path or "/"
    /// means the root. Folders come first, then documents, each sorted by
    /// lower-cased name.
    let directChildren (indexWithPaths: Map<string, RemarkableItem>) (parentPath: string) : RemarkableItem list =
        let normalizedParent = parentPath.TrimEnd('/')

        let isDirectChild (item: RemarkableItem) =
            if normalizedParent = "" then
                // Root: the path is "/<name>", i.e. a single non-empty component.
                let parts = item.Path.TrimStart('/').Split('/')
                parts.Length = 1 && parts.[0] <> ""
            else
                // Below a folder: exactly one more component after "<parent>/".
                let childPrefix = normalizedParent + "/"
                if item.Path.StartsWith(childPrefix, StringComparison.OrdinalIgnoreCase) then
                    let remainder = item.Path.Substring(childPrefix.Length)
                    remainder <> "" && not (remainder.Contains("/"))
                else
                    false

        allItems indexWithPaths
        |> List.filter isDirectChild
        |> List.sortBy (fun item -> (item.Type <> TypeCollection, item.VissibleName.ToLowerInvariant()))
| // ============================================================================= | |
| // Commands | |
| // ============================================================================= | |
/// Implementations of the command-line commands (register / list / download).
module Commands =

    // --- Token persistence ---

    /// The trimmed contents of the token file, or None when it does not exist.
    let loadToken () =
        if File.Exists(TokenFile) then
            Some (File.ReadAllText(TokenFile).Trim())
        else
            None

    /// Write `token` to the token file, replacing any previous content.
    let saveToken (token: string) =
        File.WriteAllText(TokenFile, token)

    // --- Register ---

    /// Register this machine with a one-time code and save the resulting token.
    let cmdRegister (code: string) = task {
        use client = new RemarkableClient()
        Console.info "Registering device with reMarkable cloud..."
        let! token = Api.register client code
        saveToken token
        Console.success "Registration successful! Token saved to auth.token"
    }

    // --- Initialize client (shared by list/download) ---

    /// Create a client from the saved token, refresh the token (saving the
    /// new one) and discover the storage host.
    /// Returns None, after printing an error, when no token is saved or the
    /// initialization fails. On success the caller owns the returned client
    /// and must dispose it.
    let initClient () = task {
        match loadToken () with
        | None ->
            Console.error "No auth token found. Run 'register <code>' first."
            Console.info "Get a code at: https://my.remarkable.com/device/connect/desktop"
            return None
        | Some savedToken ->
            // A single client is created and handed to the caller on success;
            // it is disposed here only when initialization fails.
            let client = new RemarkableClient()
            try
                let! newToken = Api.init client savedToken
                saveToken newToken
                return Some client
            with ex ->
                (client :> IDisposable).Dispose()
                Console.error (sprintf "Failed to initialize: %s" ex.Message)
                return None
    }

    // --- Shared helpers ---

    /// Bring a user-supplied cloud path into the form used by item paths:
    /// no trailing slash and, unless it denotes the root (returned as ""),
    /// a leading slash.
    let private normalizeRemotePath (path: string) =
        let trimmed = path.TrimEnd('/')
        if trimmed = "" then ""
        elif trimmed.StartsWith("/") then trimmed
        else "/" + trimmed

    /// "/" for the root, otherwise the path itself.
    let private displayPathOf (targetPath: string) =
        if targetPath = "" then "/" else targetPath

    /// Local-time "yyyy-MM-dd HH:mm" rendering of an item's modification
    /// stamp; an em dash when it is empty, and the first 19 characters of
    /// the raw text when it cannot be parsed.
    let private formatModified (raw: string) =
        if String.IsNullOrEmpty(raw) then "—"
        else
            try
                let dt = DateTimeOffset.Parse(raw)
                dt.LocalDateTime.ToString("yyyy-MM-dd HH:mm")
            with _ -> raw.Substring(0, min 19 raw.Length)

    /// Shorten names longer than 40 characters to 37 characters plus "...".
    let private truncateName (name: string) =
        if name.Length > 40 then name.Substring(0, 37) + "..."
        else name

    // --- List ---

    /// Print the direct children of `path` (the root when None) as a table.
    /// A path given without a leading slash is treated like the same path
    /// with one, matching how the download command interprets paths.
    let cmdList (path: string option) = task {
        let! clientOpt = initClient ()
        match clientOpt with
        | None -> ()
        | Some client ->
            use _client = client
            Console.info "Fetching document list..."
            let! items = Api.listItems client
            let (index, _tree) = FS.buildTree items
            let targetPath =
                match path with
                | Some p -> normalizeRemotePath p
                | None -> ""
            let displayPath = displayPathOf targetPath
            let children = FS.directChildren index targetPath
            if children.IsEmpty then
                Console.warn (sprintf "No items found under '%s'" displayPath)
            else
                printfn ""
                Console.info (sprintf "Contents of %s:" displayPath)
                printfn ""
                // Print header
                printfn " %-6s %-40s %-20s %s" "Type" "Name" "Modified" "ID"
                printfn " %s %s %s %s" (String('-', 6)) (String('-', 40)) (String('-', 20)) (String('-', 36))
                for item in children do
                    let isFolder = item.Type = TypeCollection
                    let icon = if isFolder then "[DIR]" else "[DOC]"
                    let modified = formatModified item.ModifiedClient
                    let name = truncateName item.VissibleName
                    let rowColor = if isFolder then ConsoleColor.Yellow else ConsoleColor.White
                    Console.color rowColor (fun () ->
                        printfn " %-6s %-40s %-20s %s" icon name modified item.ID)
                printfn ""
                let folders = children |> List.filter (fun i -> i.Type = TypeCollection) |> List.length
                let docs = children |> List.filter (fun i -> i.Type = TypeDocument) |> List.length
                printfn " %d folder(s), %d document(s)" folders docs
                printfn ""
    }

    // --- Download helpers ---

    /// Replace every character that is invalid in a file name on this
    /// platform with '_'.
    let sanitizeFileName (name: string) =
        let invalid = Path.GetInvalidFileNameChars()
        String(name.ToCharArray() |> Array.map (fun c -> if Array.contains c invalid then '_' else c))

    /// Download one document as "<sanitized name>.zip" into `outputDir`
    /// (created if missing). Returns false, after printing an error, when the
    /// item cannot be retrieved or has no download URL.
    let downloadSingleFile (client: RemarkableClient) (item: RemarkableItem) (outputDir: string) = task {
        // Get the download URL
        let! itemWithBlob = Api.getItem client item.ID true
        match itemWithBlob with
        | None ->
            Console.error (sprintf "Could not retrieve item: %s" item.VissibleName)
            return false
        | Some itemData when String.IsNullOrEmpty(itemData.BlobURLGet) ->
            Console.error (sprintf "No download URL for: %s (is it a folder?)" item.VissibleName)
            return false
        | Some itemData ->
            let fileName = sanitizeFileName item.VissibleName + ".zip"
            let destPath = Path.Combine(outputDir, fileName)
            // Ensure output directory exists
            Directory.CreateDirectory(outputDir) |> ignore
            let! _size = client.DownloadFileAsync(itemData.BlobURLGet, destPath, item.VissibleName)
            return true
    }

    /// Resolve the user's argument to items: a GUID is looked up as an ID;
    /// anything else is tried as an exact path and then as a path prefix.
    /// Prints an error and returns [] when nothing matches.
    let private resolveTargets (index: Map<string, RemarkableItem>) (pathOrId: string) : RemarkableItem list =
        let isUuid, _ = Guid.TryParse(pathOrId)
        if isUuid then
            // Direct ID lookup
            match Map.tryFind pathOrId index with
            | Some item -> [ item ]
            | None ->
                Console.error (sprintf "No item found with ID: %s" pathOrId)
                []
        else
            // Trailing slashes are dropped so that "Folder/" selects the
            // folder itself and its sub-folder layout is kept on download.
            let normalizedPath = normalizeRemotePath pathOrId
            match FS.findByPath index normalizedPath None with
            | Some item -> [ item ]
            | None ->
                // Try as prefix (folder path)
                let found = FS.findUnderPath index normalizedPath
                if found.IsEmpty then
                    Console.error (sprintf "No item found matching: %s" pathOrId)
                found

    /// Local sub-directory (sanitized, joined with the platform separator)
    /// that mirrors `doc`'s position below `folder`; "" when `doc` sits
    /// directly inside it.
    let private relativeDirUnder (folder: RemarkableItem) (doc: RemarkableItem) =
        let rel = doc.Path.Substring(folder.Path.Length).TrimStart('/')
        let parts = rel.Split('/')
        if parts.Length > 1 then
            // Everything except the last component (the document itself).
            parts.[.. parts.Length - 2]
            |> Array.map sanitizeFileName
            |> String.concat (string Path.DirectorySeparatorChar)
        else
            ""

    // --- Download ---

    /// Download the document(s) identified by `pathOrId` into `outputDir`
    /// (default "."). When the argument resolves to exactly one folder, all
    /// documents below it are downloaded and its sub-folder layout is
    /// recreated locally; otherwise the matched documents are written
    /// directly into the output directory. Prints a success/failure summary.
    let cmdDownload (pathOrId: string) (outputDir: string option) = task {
        let! clientOpt = initClient ()
        match clientOpt with
        | None -> ()
        | Some client ->
            use _client = client
            Console.info "Fetching document list..."
            let! items = Api.listItems client
            let (index, _tree) = FS.buildTree items
            let outDir = defaultArg outputDir "."
            let targetItems = resolveTargets index pathOrId
            if not targetItems.IsEmpty then
                // A single selected folder switches to recursive mode.
                let selectedFolder =
                    match targetItems with
                    | [ single ] when single.Type = TypeCollection -> Some single
                    | _ -> None
                let documents =
                    let candidates =
                        match selectedFolder with
                        | Some folder -> FS.findUnderPath index folder.Path
                        | None -> targetItems
                    candidates |> List.filter (fun i -> i.Type = TypeDocument)
                if documents.IsEmpty then
                    Console.warn "No downloadable documents found at the specified path."
                else
                    printfn ""
                    Console.info (sprintf "Downloading %d document(s) to '%s'..." documents.Length outDir)
                    printfn ""
                    let mutable successCount = 0
                    let mutable failCount = 0
                    for i, doc in List.indexed documents do
                        printfn " [%d/%d] %s" (i + 1) documents.Length doc.Path
                        // Reconstruct relative directory structure
                        let destDir =
                            match selectedFolder with
                            | Some folder ->
                                let relativePath = relativeDirUnder folder doc
                                if String.IsNullOrEmpty(relativePath) then outDir
                                else Path.Combine(outDir, relativePath)
                            | None -> outDir
                        try
                            let! ok = downloadSingleFile client doc destDir
                            if ok then successCount <- successCount + 1
                            else failCount <- failCount + 1
                        with ex ->
                            // One failing document must not abort the rest.
                            Console.error (sprintf "Failed to download '%s': %s" doc.VissibleName ex.Message)
                            failCount <- failCount + 1
                    printfn ""
                    Console.success (sprintf "Download complete: %d succeeded, %d failed" successCount failCount)
                    printfn ""
    }
| // ============================================================================= | |
| // PDF to Markdown conversion | |
| // ============================================================================= | |
| module Pdf2Md = | |
| open System.Diagnostics | |
/// Outcome of one PDF-to-Markdown conversion attempt.
type ConvertResult =
    /// The conversion produced a Markdown file at `outputPath`.
    | Success of outputPath: string
    /// The conversion did not produce usable output; `reason` says why.
    | Failed of reason: string
/// Run an external process and capture its output.
///
/// `workDir`, when given, becomes the working directory; `envVars` are added
/// to (or override) the child's environment.
/// Returns Some (exitCode, stdout, stderr) when the process ran;
/// Some (-1, "", message) when it exceeded the 5-minute limit and was
/// killed; None when it could not be started or any other error occurred.
let private runProcessWithEnv (fileName: string) (arguments: string) (workDir: string option) (envVars: (string * string) list) =
    // Single source for both the wait and the message, so they cannot drift apart.
    let timeoutMinutes = 5
    let psi = ProcessStartInfo(fileName, arguments)
    psi.UseShellExecute <- false
    psi.RedirectStandardOutput <- true
    psi.RedirectStandardError <- true
    psi.CreateNoWindow <- true
    workDir |> Option.iter (fun d -> psi.WorkingDirectory <- d)
    for (key, value) in envVars do
        psi.EnvironmentVariables.[key] <- value
    try
        use proc = Process.Start(psi)
        // Read stdout/stderr asynchronously to avoid deadlocks
        let stdoutTask = proc.StandardOutput.ReadToEndAsync()
        let stderrTask = proc.StandardError.ReadToEndAsync()
        if proc.WaitForExit(timeoutMinutes * 60 * 1000) then
            let stdout = stdoutTask.Result
            let stderr = stderrTask.Result
            Some (proc.ExitCode, stdout, stderr)
        else
            // Best effort: the process (and its children) may already be gone.
            try proc.Kill(true) with _ -> ()
            Some (-1, "", sprintf "Process timed out after %d minutes" timeoutMinutes)
    with _ ->
        // Deliberately broad: callers treat None as "tool not available".
        None
/// Same as `runProcessWithEnv`, but without any extra environment variables.
let private runProcess (fileName: string) (arguments: string) (workDir: string option) =
    let noExtraEnv : (string * string) list = []
    runProcessWithEnv fileName arguments workDir noExtraEnv
/// True when `cmd` can be located with "where" (on Win32NT) or "which"
/// (elsewhere): the lookup must exit with 0 and print something.
let private commandExists (cmd: string) =
    let locator =
        match Environment.OSVersion.Platform with
        | PlatformID.Win32NT -> "where"
        | _ -> "which"
    runProcess locator cmd None
    |> Option.exists (fun (exitCode, out, _) ->
        exitCode = 0 && not (String.IsNullOrWhiteSpace(out)))
/// Look for a usable marker-pdf installation.
/// The probes run in order and stop at the first success:
/// the marker_single CLI, the marker module via "python", via "python3",
/// and marker_single inside WSL. Returns the label of the variant found
/// ("marker_single", "python-api", "python3-api", "wsl marker_single").
let isMarkerAvailable () =
    let importProbe = "-c \"from marker.converters.pdf import PdfConverter; print('ok')\""

    // Succeeds when the command exits with code 0.
    let exitsCleanly (exe: string) (args: string) (label: string) () =
        match runProcess exe args None with
        | Some (0, _, _) -> Some label
        | _ -> None

    // Succeeds when the interpreter can import marker and prints "ok".
    let canImport (exe: string) (label: string) () =
        match runProcess exe importProbe None with
        | Some (0, out, _) when out.Trim() = "ok" -> Some label
        | _ -> None

    [ exitsCleanly "marker_single" "--help" "marker_single"
      canImport "python" "python-api"
      canImport "python3" "python3-api"
      exitsCleanly "wsl" "marker_single --help" "wsl marker_single" ]
    |> List.tryPick (fun probe -> probe ())
/// Look for pdftotext: first on PATH ("pdftotext"), then inside WSL
/// ("wsl pdftotext"). None when neither is found.
let isPdfToTextAvailable () =
    let insideWsl () =
        runProcess "wsl" "which pdftotext" None
        |> Option.bind (fun (exitCode, out, _) ->
            if exitCode = 0 && not (String.IsNullOrWhiteSpace(out)) then Some "wsl pdftotext"
            else None)
    if commandExists "pdftotext" then Some "pdftotext"
    else insideWsl ()
/// Check whether the pdfplumber route (pdf2md.py run through WSL) is usable.
/// Requires a pdf2md.py next to the application base directory or in the
/// current directory, and a WSL Python at the fixed venv path that can
/// import pdfplumber. Returns the local path of pdf2md.py on success.
let isPdfPlumberAvailable () =
    let venvProbe = "-- /home/thorium/.spectrum/venv/bin/python3 -c \"import pdfplumber; print('ok')\""

    let venvHasPdfPlumber (localPath: string) =
        // MSYS_NO_PATHCONV prevents Git Bash path mangling
        match runProcessWithEnv "wsl" venvProbe None [ ("MSYS_NO_PATHCONV", "1") ] with
        | Some (0, out, _) when out.Trim() = "ok" -> Some localPath
        | _ -> None

    [ AppDomain.CurrentDomain.BaseDirectory; Environment.CurrentDirectory ]
    |> List.map (fun dir -> Path.Combine(dir, "pdf2md.py"))
    |> List.tryFind File.Exists
    |> Option.bind venvHasPdfPlumber
/// Map a Windows path (e.g. C:\foo\bar) to its WSL mount path (/mnt/c/foo/bar).
/// The path is made absolute first; its first character is taken as the
/// drive letter and everything from the third character on as the remainder.
let private toWslPath (windowsPath: string) =
    let absolute = Path.GetFullPath(windowsPath)
    let driveLetter = Char.ToLower(absolute.[0])
    let remainder = absolute.[2..].Replace('\\', '/')
    sprintf "/mnt/%c%s" driveLetter remainder
/// Convert PDF using marker-pdf (high quality, ML-based).
///
/// `markerCmd` is the variant label: a label ending in "-api" runs a
/// generated Python script through "python"/"python3"; a label starting
/// with "wsl" runs marker_single inside WSL with paths translated to /mnt/…;
/// anything else is executed directly as the marker CLI.
/// Returns `Success path` when a Markdown file with more than 50
/// non-blank-trimmed characters was produced, otherwise `Failed reason`.
let convertWithMarker (pdfPath: string) (outputDir: string) (markerCmd: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let expectedOutputDir = Path.Combine(outputDir, baseName)
    let outputMdPath = Path.Combine(outputDir, baseName + ".md")

    /// Escape a value for use inside a single-quoted Python string literal.
    let pyLiteral (s: string) = s.Replace("\\", "\\\\").Replace("'", "\\'")

    // The third component is the generated script to delete afterwards, if any.
    let cmd, args, tempScript =
        if markerCmd.EndsWith("-api") then
            let pythonCmd = if markerCmd.StartsWith("python3") then "python3" else "python"
            // Unique name per run so concurrent conversions cannot overwrite
            // each other's script.
            let scriptPath =
                Path.Combine(Path.GetTempPath(), sprintf "marker_convert_%s.py" (Guid.NewGuid().ToString("N")))
            // Python is indentation-sensitive: the body of main() is indented
            // four spaces and the body of the `with` block eight.
            let pyCode = [
                "import os"
                "from multiprocessing import freeze_support"
                ""
                "def main():"
                sprintf "    os.makedirs('%s', exist_ok=True)" (pyLiteral outputDir)
                "    from marker.models import create_model_dict"
                "    from marker.converters.pdf import PdfConverter"
                "    from marker.config.parser import ConfigParser"
                "    config = ConfigParser({'output_format': 'markdown'})"
                "    artifacts = create_model_dict()"
                "    converter = PdfConverter(artifact_dict=artifacts, config=config.generate_config_dict())"
                sprintf "    rendered = converter('%s')" (pyLiteral pdfFull)
                sprintf "    with open('%s', 'w', encoding='utf-8') as f:" (pyLiteral outputMdPath)
                "        f.write(rendered.markdown)"
                "    print(f'Wrote {len(rendered.markdown)} chars')"
                ""
                "if __name__ == '__main__':"
                "    freeze_support()"
                "    main()"
            ]
            File.WriteAllLines(scriptPath, pyCode)
            pythonCmd, (sprintf "\"%s\"" scriptPath), Some scriptPath
        elif markerCmd.StartsWith("wsl") then
            let wslPath = toWslPath pdfFull
            let wslOutDir = toWslPath (Path.GetFullPath(outputDir))
            "wsl", (sprintf "marker_single \"%s\" --output_dir \"%s\" --output_format markdown" wslPath wslOutDir), None
        else
            markerCmd, (sprintf "\"%s\" --output_dir \"%s\" --output_format markdown" pdfFull outputDir), None

    Console.info (sprintf " Converting with marker-pdf: %s" baseName)

    let outcome =
        try
            runProcess cmd args None
        finally
            // Best-effort cleanup of the generated script.
            tempScript |> Option.iter (fun p -> try File.Delete(p) with _ -> ())

    /// Accept a Markdown file only when it has meaningful content.
    let validate (mdFile: string) =
        if File.ReadAllText(mdFile).Trim().Length > 50 then Success mdFile
        else Failed "marker produced empty or near-empty output"

    match outcome with
    | Some (0, _, _) ->
        if File.Exists(outputMdPath) then
            validate outputMdPath
        else
            // The CLI may write into a sub-directory named after the PDF, or
            // use a different file name; take the first .md found there.
            let mdFiles =
                if Directory.Exists(expectedOutputDir) then
                    Directory.GetFiles(expectedOutputDir, "*.md")
                else
                    Directory.GetFiles(outputDir, sprintf "%s*.md" baseName)
            match mdFiles |> Array.tryHead with
            | Some mdFile -> validate mdFile
            | None ->
                Failed (sprintf "marker ran but no .md file found in %s or %s" outputDir expectedOutputDir)
    | Some (code, _, stderr) ->
        Failed (sprintf "marker exited with code %d: %s" code (stderr.Trim()))
    | None ->
        Failed "failed to start marker process"
| // ----------------------------------------------------------------- | |
| // pdfplumber conversion (via WSL pdf2md.py) | |
| // ----------------------------------------------------------------- | |
/// Convert a PDF to Markdown by running the pdf2md.py (pdfplumber) script through `wsl`.
///
/// pdfPath          - PDF to convert (made absolute before use).
/// outputDir        - directory for the result; created when missing.
/// pdf2mdScriptPath - location of pdf2md.py; passed through toWslPath like the other paths.
///
/// Returns `Success mdPath` when the process exits with 0 and the expected
/// `<basename>.md` holds more than 50 non-whitespace-trimmed characters,
/// otherwise `Failed` with a short reason.
let convertWithPdfPlumber (pdfPath: string) (outputDir: string) (pdf2mdScriptPath: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let mdPath = Path.Combine(Path.GetFullPath(outputDir), baseName + ".md")
    let wslPdfPath = toWslPath pdfFull
    let wslMdPath = toWslPath mdPath
    let wslScriptPath = toWslPath pdf2mdScriptPath
    // NOTE(review): interpreter location is hard-coded to one user's venv inside WSL.
    let pythonPath = "/home/thorium/.spectrum/venv/bin/python3"
    // Every path argument is quoted, including the script path, so that
    // locations containing spaces reach the script as a single argument.
    let args = $"-- {pythonPath} \"{wslScriptPath}\" \"{wslPdfPath}\" \"{wslMdPath}\""
    // NOTE(review): presumably stops MSYS/Git Bash from rewriting the /mnt/... paths - confirm.
    let wslEnv = [("MSYS_NO_PATHCONV", "1")]
    Console.info $" Converting with pdfplumber: {baseName}"
    match runProcessWithEnv "wsl" args None wslEnv with
    | Some (0, _, _) ->
        if File.Exists(mdPath) then
            let content = File.ReadAllText(mdPath)
            if content.Trim().Length > 50 then Success mdPath
            else Failed "pdfplumber produced empty or near-empty output"
        else
            Failed $"pdfplumber ran but output file not found: {mdPath}"
    | Some (code, stdout, stderr) ->
        // Fall back to stdout when the process wrote nothing to stderr.
        let errMsg = if String.IsNullOrWhiteSpace(stderr) then stdout.Trim() else stderr.Trim()
        Failed $"pdfplumber exited with code {code}: {errMsg}"
    | None ->
        Failed "failed to start WSL/pdfplumber process"
| // ----------------------------------------------------------------- | |
| // pdftotext + heuristic formatting fallback | |
| // ----------------------------------------------------------------- | |
| open System.Text.RegularExpressions | |
/// Removes the single space that follows each of the letter groups
/// "fi", "fl", "ff", "ffi" and "ffl", in that order.
/// NOTE(review): this also joins ordinary words that merely end in one of
/// those groups (e.g. "off the" becomes "offthe") - confirm before using it.
let private fixLigatures (text: string) =
    // The order is significant: closing an "ff " gap can create an "ffi " gap
    // that the later pass then closes.
    [ "fi"; "fl"; "ff"; "ffi"; "ffl" ]
    |> List.fold (fun (acc: string) (ligature: string) -> acc.Replace(ligature + " ", ligature)) text
/// Targeted ligature repair: rewrites a fixed list of words that commonly lose
/// their fi/fl/ff/ffi ligature during PDF text extraction.
///
/// Matching is case-insensitive. When the matched text starts with an
/// upper-case letter the repaired word keeps that capital, so "Di erent"
/// becomes "Different" rather than "different".
/// NOTE(review): the patterns of the form `\b\s+xyz` consume the whitespace
/// before the fragment, i.e. they join it to the preceding word ("bene t" ->
/// "benefit", but also "the rst" -> "thefirst"). Left unchanged because the
/// extracted text does not distinguish the two cases.
let private fixCommonLigatureWords (text: string) =
    // These are the most common ligature-broken words in academic PDFs
    let fixes = [
        (@"\bclassi\s+cation", "classification")
        (@"\bspeci\s+c\b", "specific")
        (@"\bsigni\s+cant", "significant")
        (@"\barti\s+cial", "artificial")
        (@"\bscienti\s+c\b", "scientific")
        (@"\bdi\s+erent", "different")
        (@"\bdi\s+erence", "difference")
        (@"\bdi\s+cult", "difficult")
        (@"\be\s+ective", "effective")
        (@"\be\s+ect\b", "effect")
        (@"\be\s+ort\b", "effort")
        (@"\be\s+cien", "efficien")
        (@"\bo\s+er\b", "offer")
        (@"\bsu\s+cien", "sufficien")
        (@"\bsu\s+er\b", "suffer")
        (@"\bco\s+ee\b", "coffee")
        (@"\b\s+lter", "filter")
        (@"\b\s+rst\b", "first")
        (@"\b\s+nd\b", "find")
        (@"\b\s+eld", "field")
        (@"\b\s+le\b", "file")
        (@"\b\s+nite", "finite")
        (@"\b\s+gure", "figure")
        (@"\b\s+ve\b", "five")
        (@"\b\s+xed\b", "fixed")
        (@"\b\s+t\b", "fit")
        (@"\bpro\s+le\b", "profile")
        (@"\bin\s+uence", "influence")
        (@"\bre\s+ect", "reflect")
        (@"\buore", "fluore")
    ]
    // Applies one fix to the whole text, carrying a leading capital over to the
    // replacement. Matches that begin with whitespace are never capitalised.
    let applyFix (current: string) (pattern: string, replacement: string) =
        let evaluator =
            MatchEvaluator(fun m ->
                if m.Length > 0 && Char.IsUpper(m.Value, 0) then
                    string (Char.ToUpperInvariant(replacement.[0])) + replacement.Substring(1)
                else
                    replacement)
        Regex.Replace(current, pattern, evaluator, RegexOptions.IgnoreCase)
    fixes |> List.fold applyFix text
/// Heuristic heading detector. Returns `Some level` (1 = chapter, 2 = section,
/// 3 = sub-sub-section) when the line looks like a heading, otherwise `None`.
/// "Chapter N" lines and well-known section names only count when the
/// previous line is absent or blank, so mid-sentence mentions are ignored.
let private isHeading (line: string) (prevLine: string option) =
    let text = line.Trim()
    let looksLike (pattern: string) = Regex.IsMatch(text, pattern)
    // Grants the level only to a standalone line (no non-blank predecessor).
    let whenStandalone (level: int) =
        match prevLine with
        | Some previous when not (String.IsNullOrWhiteSpace(previous)) -> None
        | _ -> Some level
    // ALL-CAPS line of 5..60 characters made of letters, spaces, '-' and ':'.
    let isShortAllCaps () =
        text.Length >= 5
        && text.Length <= 60
        && text = text.ToUpper()
        && looksLike @"^[A-Z][A-Z\s\-:]+[A-Z]$"
    if String.IsNullOrEmpty(text) then
        None
    elif looksLike @"^Chapter\s+\d+\s*[\-–—:]" then
        // "Chapter 3: ..." at the very start of the line
        whenStandalone 1
    elif looksLike @"^\d+\.\d+\.\d+\s+[A-Z]" then
        // "2.3.1 Something"
        Some 3
    elif looksLike @"^\d+\.\d+\s+[A-Z]" then
        // "2.1 Introduction"
        Some 2
    elif isShortAllCaps () then
        Some 2
    elif looksLike @"^(Preface|Appendix|Bibliography|Bibliographic Notes|Exercises|References|Index|Glossary|Acknowledgments|Dedication|Foreword|Conclusion|Summary)\b"
         && text.Length < 80 then
        whenStandalone 2
    else
        None
/// Heuristic: true when the trimmed line looks like a page number or a running
/// page header/footer rather than body text.
let private isArtifact (line: string) =
    let candidate = line.Trim()
    [ @"^\d{1,4}$"            // standalone page number, 1-4 digits
      @"^\d+\s{2,}\d+\.\s"    // page header such as "<page>  <chapter>. <title>"
      @"^[ivxlcdm]+$" ]       // lower-case roman numeral page number
    |> List.exists (fun pattern -> Regex.IsMatch(candidate, pattern))
/// True when the trimmed line opens like a figure or table caption:
/// "FIGURE"/"TABLE"/"Figure"/"Table", whitespace, a number, then '.' or ':'.
let private isFigureOrTable (line: string) =
    let captionStart = Regex(@"^(FIGURE|TABLE|Figure|Table)\s+\d+[\.\:]")
    line.Trim() |> captionStart.IsMatch
/// True when the trimmed line starts like a list item: a bullet glyph,
/// "-" or "*" followed by whitespace and text, or a lettered item ("a. text").
let private isBullet (line: string) =
    let item = line.Trim()
    let startsWithGlyph =
        [ "●"; "•"; "◦"; "▪" ] |> List.exists (fun (glyph: string) -> item.StartsWith(glyph))
    startsWithGlyph
    || Regex.IsMatch(item, @"^[\-\*]\s+\S")
    // Lettered list items: "a. something", "b. something"
    || Regex.IsMatch(item, @"^[a-z]\.\s+\S")
/// True when the trimmed line is nothing but an equation reference
/// of the form "(digits.digits)".
let private isEquationRef (line: string) =
    let reference = Regex.Match(line.Trim(), @"^\(\d+\.\d+\)$")
    reference.Success
/// Convert raw pdftotext output to structured Markdown.
///
/// rawText        - text as produced by pdftotext; split on '\n', each line trimmed.
/// sourceFileName - file name whose extension-less form becomes the top-level title.
///
/// Consecutive text lines are joined into one paragraph line. Blank input
/// lines, headings, captions, equation references and list items end the
/// current paragraph and are separated from it by a blank line, so paragraph
/// breaks survive in the Markdown and a list item is never glued onto the
/// end of the preceding sentence. The result is passed through
/// fixCommonLigatureWords before it is returned.
let private textToMarkdown (rawText: string) (sourceFileName: string) =
    let sb = StringBuilder()
    let title = Path.GetFileNameWithoutExtension(sourceFileName)
    sb.AppendLine($"# {title}").AppendLine() |> ignore

    // Ends an unterminated paragraph line (if one is open) and makes sure a
    // blank line precedes whatever is written next. The flags are passed by
    // value because closures cannot capture `let mutable` locals.
    let separate (paragraphOpen: bool) (blankAlready: bool) =
        if paragraphOpen then sb.AppendLine() |> ignore
        if not blankAlready then sb.AppendLine() |> ignore

    // prevWasEmpty: the output currently ends with a blank line.
    // inBlock:      the output ends with paragraph text that has no line terminator yet.
    let mutable prevWasEmpty = true
    let mutable inBlock = false
    let mutable prevRawLine : string option = None

    for line in rawText.Split([| '\n' |]) do
        let trimmed = line.Trim()
        if String.IsNullOrWhiteSpace(trimmed) then
            separate inBlock prevWasEmpty
            prevWasEmpty <- true
            inBlock <- false
        elif isArtifact trimmed then
            () // skip page numbers and headers
        elif isEquationRef trimmed then
            separate inBlock prevWasEmpty
            sb.AppendLine(trimmed).AppendLine() |> ignore
            prevWasEmpty <- true
            inBlock <- false
        elif isFigureOrTable trimmed then
            separate inBlock prevWasEmpty
            sb.AppendLine($"***{trimmed}***").AppendLine() |> ignore
            prevWasEmpty <- true
            inBlock <- false
        elif isBullet trimmed then
            let content =
                if trimmed.StartsWith("●") || trimmed.StartsWith("•")
                   || trimmed.StartsWith("◦") || trimmed.StartsWith("▪") then
                    trimmed.Substring(1).Trim()
                elif Regex.IsMatch(trimmed, @"^[a-z]\.\s+") then
                    // lettered items keep their "a." label
                    trimmed
                else
                    // "-" / "*" marker
                    trimmed.Substring(1).Trim()
            // Close an open paragraph first; otherwise the item would be
            // appended to the same output line as the paragraph text.
            if inBlock then separate true false
            sb.AppendLine($"- {content}") |> ignore
            prevWasEmpty <- false
            inBlock <- false
        else
            match isHeading trimmed prevRawLine with
            | Some level ->
                separate inBlock prevWasEmpty
                // The document title owns '#', so detected levels shift down by one.
                let prefix = String('#', level + 1)
                sb.AppendLine($"{prefix} {trimmed}").AppendLine() |> ignore
                prevWasEmpty <- true
                inBlock <- false
            | None ->
                if inBlock then
                    sb.Append(' ').Append(trimmed) |> ignore
                else
                    sb.Append(trimmed) |> ignore
                prevWasEmpty <- false
                inBlock <- true
        // Remembered for every input line so isHeading can test "previous line blank".
        prevRawLine <- Some trimmed

    sb.ToString() |> fixCommonLigatureWords
/// Convert a PDF with `pdftotext -layout` and format the captured text as
/// Markdown with the heuristic formatter.
///
/// pdfPath      - PDF to convert (made absolute before use).
/// outputDir    - directory for `<basename>.md`; created when missing.
/// pdftotextCmd - detected command; a value starting with "wsl" runs pdftotext through WSL.
///
/// Returns `Success mdPath` when pdftotext exits with 0 and printed more than
/// 50 characters (after trimming), otherwise `Failed` with a short reason.
let convertWithPdfToText (pdfPath: string) (outputDir: string) (pdftotextCmd: string) =
    Directory.CreateDirectory(outputDir) |> ignore
    let pdfFull = Path.GetFullPath(pdfPath)
    let baseName = Path.GetFileNameWithoutExtension(pdfPath)
    let mdPath = Path.Combine(outputDir, baseName + ".md")
    let cmd, args =
        if pdftotextCmd.StartsWith("wsl") then
            "wsl", $"pdftotext -layout \"{toWslPath pdfFull}\" -"
        else
            "pdftotext", $"-layout \"{pdfFull}\" -"
    Console.info $" Converting with pdftotext (fallback): {baseName}"
    match runProcess cmd args None with
    | Some (0, stdout, _) ->
        if stdout.Trim().Length > 50 then
            // Pass the file name WITH its extension: textToMarkdown strips one
            // extension itself, so handing it the already-stripped base name
            // would cut "book.v2.pdf" down to the title "book".
            let markdown = textToMarkdown stdout (Path.GetFileName(pdfPath))
            File.WriteAllText(mdPath, markdown)
            Success mdPath
        else
            Failed "pdftotext produced empty or near-empty output (scanned PDF without OCR?)"
    | Some (code, _, stderr) ->
        Failed $"pdftotext exited with code {code}: {stderr.Trim()}"
    | None ->
        Failed "failed to start pdftotext process"
| // ----------------------------------------------------------------- | |
| // Fallback chain: marker -> pdfplumber -> pdftotext+heuristics | |
| // ----------------------------------------------------------------- | |
/// Result of tool detection, computed once and reused for every PDF of a batch.
/// A field is `None` when the corresponding tool was not found.
type ToolConfig =
    { /// Command handed to convertWithMarker.
      Marker: string option
      /// Value handed to convertWithPdfPlumber as the pdf2md.py script path.
      PdfPlumber: string option
      /// Command handed to convertWithPdfToText.
      PdfToText: string option }
/// Probe for each conversion tool (marker, pdfplumber, pdftotext, in that
/// order) and collect the outcomes in a ToolConfig.
let detectTools () : ToolConfig =
    let marker = isMarkerAvailable ()
    let pdfPlumber = isPdfPlumberAvailable ()
    let pdfToText = isPdfToTextAvailable ()
    { Marker = marker
      PdfPlumber = pdfPlumber
      PdfToText = pdfToText }
/// Convert one PDF to Markdown with the first tool that succeeds.
/// Tools are tried in the order marker-pdf, pdfplumber, pdftotext; a tool
/// that is `None` in `tools` is skipped. Each failure that still has a
/// successor is reported as a warning; the last failure is returned.
let convertPdfWith (tools: ToolConfig) (pdfPath: string) (outputDir: string) : ConvertResult =
    // Pairs a display name with a deferred conversion when the tool was detected.
    let attemptFor (name: string) (run: string -> ConvertResult) (tool: string option) =
        tool |> Option.map (fun detected -> name, (fun () -> run detected))

    // Runs the attempts in order and stops at the first success.
    let rec runChain attempts =
        match attempts with
        | [] -> Failed "All conversion tools failed"
        | (name, run) :: rest ->
            match run (), rest with
            | Success path, _ -> Success path
            | Failed reason, [] -> Failed $"{name} failed: {reason}"
            | Failed reason, (nextName, _) :: _ ->
                Console.warn $" {name} failed ({reason}), trying {nextName}..."
                runChain rest

    if not (File.Exists(pdfPath)) then
        Failed $"PDF file not found: {pdfPath}"
    else
        let attempts =
            [ tools.Marker |> attemptFor "marker-pdf" (convertWithMarker pdfPath outputDir)
              tools.PdfPlumber |> attemptFor "pdfplumber" (convertWithPdfPlumber pdfPath outputDir)
              tools.PdfToText |> attemptFor "pdftotext" (convertWithPdfToText pdfPath outputDir) ]
            |> List.choose id
        match attempts with
        | [] -> Failed "No conversion tools found. Install marker-pdf, pdfplumber (WSL), or pdftotext (poppler-utils)."
        | _ -> runChain attempts
/// Convert one PDF, detecting the available tools on every call.
/// For several files, detect once and call convertPdfWith instead.
let convertPdf (pdfPath: string) (outputDir: string) =
    let tools = detectTools ()
    convertPdfWith tools pdfPath outputDir
/// Convert a list of PDFs to Markdown, printing per-file progress and a final
/// summary. Tools are detected once for the whole batch.
/// Returns one `(pdfPath, result)` pair per input, or an empty list when no
/// conversion tool is available.
let convertBatch (pdfPaths: string list) (outputDir: string) =
    printfn ""
    let total = List.length pdfPaths
    Console.info $"Converting {total} PDF(s) to Markdown..."
    let tools = detectTools ()

    // Announce the tools that were found ...
    match tools.Marker with
    | Some cmd -> Console.info $" Primary tool: marker-pdf ({cmd})"
    | None -> ()
    match tools.PdfPlumber with
    | Some path -> Console.info $" Mid-tier tool: pdfplumber via WSL ({Path.GetFileName(path)})"
    | None -> ()
    match tools.PdfToText with
    | Some cmd -> Console.info $" Fallback tool: pdftotext ({cmd})"
    | None -> ()

    // ... then warn about each one that is missing.
    let missingWarnings =
        [ tools.Marker, " marker-pdf not found (install with: pip install marker-pdf)"
          tools.PdfPlumber, " pdfplumber not found (requires WSL + pdf2md.py + pdfplumber venv)"
          tools.PdfToText, " pdftotext not found (install poppler-utils)" ]
    for (tool, warning) in missingWarnings do
        if Option.isNone tool then Console.warn warning

    let nothingAvailable = missingWarnings |> List.forall (fun (tool, _) -> Option.isNone tool)
    if nothingAvailable then
        Console.error "No PDF conversion tools available!"
        Console.info "Install one of:"
        Console.info " pip install marker-pdf (recommended, ML-based, high quality)"
        Console.info " WSL + pdfplumber (mid-tier, font-size-aware headings)"
        Console.info " apt install poppler-utils (pdftotext, lightweight fallback)"
        []
    else
        printfn ""
        // Converts one file and reports its outcome right away.
        let convertOne (index: int) (pdfPath: string) =
            let name = Path.GetFileName(pdfPath)
            printfn $" [{index + 1}/{total}] {name}"
            let outcome = convertPdfWith tools pdfPath outputDir
            match outcome with
            | Success outPath -> Console.success $" -> {outPath}"
            | Failed reason -> Console.error $" FAILED: {reason}"
            pdfPath, outcome
        let results = pdfPaths |> List.mapi convertOne
        let successCount =
            results |> List.sumBy (fun (_, outcome) -> match outcome with Success _ -> 1 | _ -> 0)
        let failCount = results.Length - successCount
        printfn ""
        Console.success $"Conversion complete: {successCount} succeeded, {failCount} failed"
        results
| // ============================================================================= | |
| // Convert command | |
| // ============================================================================= | |
module Commands2 =
    /// `convert` command: resolve `pathOrPattern` to a list of PDF files and
    /// convert them all into `outputDir` (current directory when omitted).
    ///
    /// `pathOrPattern` is interpreted, in this order, as
    ///   1. an existing file  -> that single file,
    ///   2. an existing directory -> every PDF below it, recursively,
    ///   3. otherwise a `<dir>/<wildcard>` pattern evaluated in `<dir>`.
    /// In cases 2 and 3 the ".pdf" extension is compared case-insensitively, so
    /// "Book.PDF" is picked up on case-sensitive file systems as well.
    let cmdConvert (pathOrPattern: string) (outputDir: string option) =
        let outDir = defaultArg outputDir "."
        let hasPdfExtension (file: string) =
            file.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase)
        // Collect PDF files to convert
        let pdfFiles =
            if File.Exists(pathOrPattern) then
                // Single file
                [ Path.GetFullPath(pathOrPattern) ]
            elif Directory.Exists(pathOrPattern) then
                // Directory: enumerate everything and filter by extension here,
                // because a "*.pdf" search pattern follows the platform's case rules.
                Directory.GetFiles(Path.GetFullPath(pathOrPattern), "*", SearchOption.AllDirectories)
                |> Array.filter hasPdfExtension
                |> Array.toList
            else
                // Try as glob pattern
                let dir = Path.GetDirectoryName(pathOrPattern)
                let pattern = Path.GetFileName(pathOrPattern)
                let searchDir = if String.IsNullOrEmpty(dir) then "." else dir
                if Directory.Exists(searchDir) then
                    Directory.GetFiles(searchDir, pattern)
                    |> Array.filter hasPdfExtension
                    |> Array.toList
                else
                    []
        if pdfFiles.IsEmpty then
            Console.error (sprintf "No PDF files found matching: %s" pathOrPattern)
        else
            let _results = Pdf2Md.convertBatch pdfFiles outDir
            ()
| // ============================================================================= | |
| // CLI entry point | |
| // ============================================================================= | |
/// Print the command-line help text (usage, examples, tool hints, setup steps).
let showUsage () =
    [ ""
      "reMarkable Cloud File Downloader & PDF Converter"
      "================================================"
      ""
      "Usage:"
      " dotnet fsi remarkable-download.fsx register <code> Register device"
      " dotnet fsi remarkable-download.fsx list [/path] List folders and files"
      " dotnet fsi remarkable-download.fsx download <path|id> [output-dir]"
      " Download file or folder"
      " dotnet fsi remarkable-download.fsx convert <pdf|dir> [output-dir]"
      " Convert PDF(s) to Markdown"
      ""
      "Examples:"
      " dotnet fsi remarkable-download.fsx register abcdefgh"
      " dotnet fsi remarkable-download.fsx list"
      " dotnet fsi remarkable-download.fsx list /MyNotebooks"
      " dotnet fsi remarkable-download.fsx download /MyNotebooks/Paper.pdf"
      " dotnet fsi remarkable-download.fsx download /MyNotebooks ./downloads"
      " dotnet fsi remarkable-download.fsx download 12345678-abcd-efgh-ijkl-123456789abc"
      " dotnet fsi remarkable-download.fsx convert mybook.pdf"
      " dotnet fsi remarkable-download.fsx convert mybook.pdf ./markdown-output"
      " dotnet fsi remarkable-download.fsx convert ./pdfs-folder ./markdown-output"
      ""
      "Conversion tools (install at least one):"
      " pip install marker-pdf ML-based, best quality (recommended)"
      " WSL + pdfplumber + pdf2md.py Mid-tier, font-size-aware headings"
      " pdftotext (poppler-utils) Lightweight fallback, heuristic formatting"
      ""
      "First-time setup:"
      " 1. Go to https://my.remarkable.com/device/connect/desktop"
      " 2. Get a one-time code"
      " 3. Run: dotnet fsi remarkable-download.fsx register <code>"
      "" ]
    |> List.iter (fun (line: string) -> printfn "%s" line)
/// CLI dispatcher. `args` is expected to be `fsi.CommandLineArgs`, whose
/// leading entries (the script file name and any "--" flags) are dropped
/// before the remaining words are matched against the known commands.
/// Anything that does not match a command prints the usage text.
let main (args: string[]) =
    // Leading entries that belong to fsi rather than to the user.
    let isScriptOrFlag (arg: string) = arg.EndsWith(".fsx") || arg.StartsWith("--")
    let userArgs = args |> Array.toList |> List.skipWhile isScriptOrFlag
    match userArgs with
    | "register" :: code :: _ ->
        (Commands.cmdRegister code).GetAwaiter().GetResult()
    | "list" :: rest ->
        // optional path
        (Commands.cmdList (List.tryHead rest)).GetAwaiter().GetResult()
    | "download" :: pathOrId :: rest ->
        // optional output directory
        (Commands.cmdDownload pathOrId (List.tryHead rest)).GetAwaiter().GetResult()
    | "convert" :: pathOrPattern :: rest ->
        // optional output directory
        Commands2.cmdConvert pathOrPattern (List.tryHead rest)
    | _ ->
        showUsage ()

main (fsi.CommandLineArgs)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment