@bkataru
Last active October 17, 2025 11:25
const std = @import("std");

// DTO for deserialization
const LLMResponse = struct {
    id: []const u8, // Unique identifier for the response
    object: []const u8, // Type of object returned
    created: u32, // Unix timestamp of when the response was generated
    model: []const u8, // Name of the model used to generate the response
    usage: ?struct { // Usage statistics for the response, optional
        prompt_tokens: u32, // Number of tokens in the prompt
        completion_tokens: u32, // Number of tokens in the completion
        total_tokens: u32, // Total number of tokens used
    } = null,
    timings: ?struct { // Timing statistics for the response, optional
        prompt_n: u32, // Number of prompt tokens processed
        prompt_ms: f64, // Total time taken to process the prompt, in milliseconds
        prompt_per_token_ms: f64, // Average time per prompt token, in milliseconds
        prompt_per_second: f64, // Prompt processing speed, in tokens per second
        predicted_n: u32, // Number of tokens generated
        predicted_ms: f64, // Total time taken to generate the prediction, in milliseconds
        predicted_per_token_ms: f64, // Average time per generated token, in milliseconds
        predicted_per_second: f64, // Generation speed, in tokens per second
    } = null,
    choices: []struct { // Array of choices generated by the model
        message: struct { // Message generated by the model
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct { // Log probabilities of the tokens generated, optional
            content: []struct { // Array of token logprob objects
                token: []const u8, // String representation of the token
                logprob: f64, // Using f64 for double precision log probabilities
                bytes: []const u8, // Raw bytes of the token
                // top_logprobs is an array of objects, each containing a token and its logprob.
                // This is present only if top_logprobs was requested in the API call.
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                },
            },
        } = null,
        finish_reason: []const u8, // Reason for finishing the response
        index: u32, // Index of the choice in the array
    },
    system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
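// Illustrative (abridged) example of the JSON an LLMResponse is parsed from, following the
// OpenAI-compatible /v1/chat/completions response shape; values below are made up:
//   {
//     "id": "chatcmpl-123",
//     "object": "chat.completion",
//     "created": 1723700000,
//     "model": "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
//     "choices": [
//       {
//         "index": 0,
//         "message": { "role": "assistant", "content": "..." },
//         "logprobs": null,
//         "finish_reason": "stop"
//       }
//     ],
//     "usage": { "prompt_tokens": 25, "completion_tokens": 12, "total_tokens": 37 },
//     "system_fingerprint": "..."
//   }
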
// DTO for serialization (when sending requests)
const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};
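// For reference, std.json.stringifyAlloc serializes a RequestPayload into JSON like
// the following (illustrative, whitespace added for readability):
//   { "model": "...", "messages": [ { "role": "system", "content": "..." },
//                                   { "role": "user", "content": "..." } ] }
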
/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\n');

    // Split the template by newline and iterate through each line
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
        try result.writer().print("{s}", .{parts.next().?}); // Print the first part
        while (parts.next()) |part| {
            // If there's a dynamic argument available, print it
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part}); // Print the next part of the line
        }
        try result.writer().writeByte('\n'); // Add a newline after each line is processed
    }
    _ = result.pop(); // Remove the last (unnecessary) newline added by the loop

    return result.toOwnedSlice();
}
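// A minimal test sketch of formatTemplate (illustrative; the template and substitutions
// below are made up): shows how "{s}" placeholders are consumed in order across lines.
// Run with `zig test` on this file.
test "formatTemplate substitutes placeholders in order" {
    const template =
        \\hello {s},
        \\welcome to {s}
    ;
    const formatted = try formatTemplate(std.testing.allocator, template, &[_][]const u8{ "raja", "zig" });
    defer std.testing.allocator.free(formatted);
    try std.testing.expectEqualStrings("hello raja,\nwelcome to zig", formatted);
}
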
/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    // Handles all memory allocations for the network request
    // This means any derived deinits are all noops, so can be omitted
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    // Create client
    var client = std.http.Client{ .allocator = request_arena_allocator };

    // Initialize an array list to store the response body bytes
    var body = std.ArrayList(u8).init(request_arena_allocator);

    // Parse URI for POST endpoint /v1/chat/completions
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    // Prepare request payload
    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\n", .{payload});

    // Make the POST request
    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
        // "Accept: application/json" goes in extra_headers; accept_encoding is meant for
        // compression codings (gzip, deflate), not the response media type
        .extra_headers = &.{
            .{ .name = "Accept", .value = "application/json" },
        },
    });
    // Print the response status
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Response status: {}\n", .{response.status});

    // Do whatever you need to in case of HTTP error; here we log and bail out
    // instead of trying to parse an error body as an LLMResponse
    if (response.status != .ok) {
        std.debug.print("HTTP error from llama-server: {}\nResponse body: {s}\n", .{ response.status, body.items });
        return error.HttpRequestFailed;
    }
    // Deserialize JSON response into a struct
    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator, // Use main allocator so memory persists after arena cleanup
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );
    // note: wow an arena is perfect for this typa control flow lol
    return parsed;
}
pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator(); // a.k.a. debug allocator
    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\You are a helpful assistant.
        \\The user's name is {s}.
        \\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);
    std.debug.print("system prompt: {s}\n", .{system_prompt});
    std.debug.print("{s}\n", .{"=" ** 50});

    const prompt = "who are we?";
    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    // Assistant's response
    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}
bkataru commented Aug 15, 2025

a single-file request client in pure zig for llama.cpp's OpenAI API compatible inference server

  • minimal
  • no dependencies
  • pure zig 0.14.1
  • did i mention it's a single file?
  • wields zig's rich standard library:
    • std.json for (de)serialization
    • std.http for interneting.
    • is all you need
  • deterministic memory allocations per request because holy hell arena allocators go so hard (see the sketch after this list).
  • with a tiny, from-scratch, templating engine???!?! langchain go brr
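
roughly, the per-request arena pattern in llmCall boils down to this minimal, standalone sketch (names and strings here are made up for illustration; assumes zig 0.14's std API):

const std = @import("std");

pub fn main() !void {
    // one arena per "request": individual frees become no-ops and everything
    // allocated below is released together in a single, deterministic deinit
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const alloc = arena.allocator();

    // any number of intermediate allocations (payload strings, response buffers, ...)
    const payload = try std.fmt.allocPrint(alloc, "request for {s}", .{"raja"});
    std.debug.print("{s}\n", .{payload});
}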

just make sure you have an installation of llama.cpp, you can

  • download precompiled binaries either
    • directly
    • using some package manager, OS or otherwise
  • build from source
    • get yourself a C/C++ toolchain (MSVC/GCC/Clang) plus Make/CMake beforehand if you choose to venture down this road.
  • use a frontend that wraps and ships it such as
    • ollama (ew)
    • jan (yay)
    • lm studio (yay but closed-source)

and you have its OpenAI compatible inference server fired up and listening on http://127.0.0.1:1337

and finally that you have a .GGUF of Qwen_Qwen3-4B-Instruct-2507-IQ4_XS downloaded and ready to go
