how to llama_cpp_client.zig - https://bkataru.bearblog.dev/llama-cpp-client-zig
const std = @import("std");

// DTO for deserialization
const LLMResponse = struct {
    id: []const u8, // Unique identifier for the response
    object: []const u8, // Type of object returned
    created: u32, // Unix timestamp of when the response was generated
    model: []const u8, // Name of the model used to generate the response
    usage: ?struct { // Usage statistics for the response, optional
        prompt_tokens: u32, // Number of tokens in the prompt
        completion_tokens: u32, // Number of tokens in the completion
        total_tokens: u32, // Total number of tokens used
    } = null,
    timings: ?struct { // Timing statistics for the response, optional
        prompt_n: u32, // Number of prompt tokens processed
        prompt_ms: f64, // Total time taken to process the prompt, in milliseconds
        prompt_per_token_ms: f64, // Average time per prompt token, in milliseconds
        prompt_per_second: f64, // Prompt processing speed, in tokens per second
        predicted_n: u32, // Number of tokens predicted
        predicted_ms: f64, // Total time taken to generate the predictions, in milliseconds
        predicted_per_token_ms: f64, // Average time per predicted token, in milliseconds
        predicted_per_second: f64, // Prediction speed, in tokens per second
    } = null,
    choices: []struct { // Array of choices generated by the model
        message: struct { // Message generated by the model
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct { // Log probabilities of the tokens generated, optional
            content: []struct { // Array of token logprob objects
                token: []const u8, // Token ID or string representation of the token
                logprob: f64, // Using f64 for double precision log probabilities
                bytes: []const u8, // Raw bytes of the token
                // top_logprobs is an array of objects, each containing a token and its logprob.
                // It is present only if top_logprobs was requested in the API call, so it
                // defaults to null to allow the field to be omitted.
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                } = null,
            },
        } = null,
        finish_reason: []const u8, // Reason for finishing the response
        index: u32, // Index of the choice in the array
    },
    system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
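
// For reference, an abridged sketch of the kind of JSON body LLMResponse is meant to
// deserialize (the values below are made up for illustration, not actual server output):
//
//   {
//     "id": "chatcmpl-abc123",
//     "object": "chat.completion",
//     "created": 1700000000,
//     "model": "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
//     "choices": [
//       {
//         "index": 0,
//         "message": { "role": "assistant", "content": "..." },
//         "logprobs": null,
//         "finish_reason": "stop"
//       }
//     ],
//     "usage": { "prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30 },
//     "system_fingerprint": "..."
//   }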

// DTO for serialization (when sending requests)
const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};
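
// A quick sanity check of what the request DTOs serialize to. This assumes Zig's default
// std.json.stringify behavior (fields in declaration order, minified output); the model
// name and message below are made up for illustration. Run with `zig test`.
test "RequestPayload serializes to an OpenAI-style chat completion body" {
    var messages = [_]Message{
        .{ .role = "user", .content = "hi" },
    };
    const payload = RequestPayload{ .model = "test-model", .messages = &messages };
    const json = try std.json.stringifyAlloc(std.testing.allocator, payload, .{});
    defer std.testing.allocator.free(json);
    try std.testing.expectEqualStrings(
        "{\"model\":\"test-model\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}",
        json,
    );
}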

/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\n');

    // Split the template by newline and iterate through each line
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
        try result.writer().print("{s}", .{parts.next().?}); // Print the first part
        while (parts.next()) |part| {
            // If there's a dynamic argument available, print it
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part}); // Print the next part of the line
        }
        try result.writer().writeByte('\n'); // Add a newline after each line is processed
    }
    _ = result.pop(); // Remove the last (unnecessary) newline added by the loop

    return result.toOwnedSlice();
}
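
// A small sanity check for formatTemplate; the template and substitutions here are
// made up for illustration. Run with `zig test`.
test "formatTemplate fills {s} placeholders in order" {
    const formatted = try formatTemplate(
        std.testing.allocator,
        "hello {s}, my name is {s}",
        &[_][]const u8{ "world", "jocasta" },
    );
    defer std.testing.allocator.free(formatted);
    try std.testing.expectEqualStrings("hello world, my name is jocasta", formatted);
}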

/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    // Handles all memory allocations for the network request
    // This means any derived deinits are all noops, so can be omitted
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    // Create client
    var client = std.http.Client{ .allocator = request_arena_allocator };

    // Initialize an array list to store the response body bytes
    var body = std.ArrayList(u8).init(request_arena_allocator);

    // Parse URI for POST endpoint /v1/chat/completions
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    // Prepare request payload
    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});

    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\n", .{payload});

    // Make the POST request
    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .accept_encoding = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
    });

    // Print the response status
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Response status: {}\n", .{response.status});

    // Do whatever you need to in case of HTTP error.
    if (response.status != .ok) {
        std.debug.print("HTTP Error: {}\n", .{response.status});
        std.debug.print("Response body: {s}\n", .{body.items});
        std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
    }

    // Deserialize JSON response into a struct
    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator, // Use main allocator so memory persists after arena cleanup
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );
    // note: wow an arena is perfect for this typa control flow lol
    return parsed;
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator(); // a.k.a. debug allocator
    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\You are a helpful assistant.
        \\The user's name is {s}.
        \\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);

    std.debug.print("system prompt: {s}\n", .{system_prompt});
    std.debug.print("{s}\n", .{"=" ** 50});

    const prompt = "who are we?";
    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    // Assistant's response
    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}
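
// To build and run this file directly (assuming it is saved as llama_cpp_client.zig and you
// have a Zig toolchain whose std matches the APIs used above, e.g. the 0.13/0.14 era):
//   zig run llama_cpp_client.zig
// and to run the test blocks above:
//   zig test llama_cpp_client.zig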
Uses std.json for (de)serialization and std.http for interneting. Just make sure you have an installation of llama.cpp, that you have its OpenAI-compatible inference server fired up and listening on http://127.0.0.1:1337, and finally that you have a .GGUF of Qwen_Qwen3-4B-Instruct-2507-IQ4_XS downloaded and ready to go.