how to llama_cpp_client.zig - https://bkataru.bearblog.dev/llama-cpp-client-zig
const std = @import("std");

// DTO for deserialization
const LLMResponse = struct {
    id: []const u8, // Unique identifier for the response
    object: []const u8, // Type of object returned
    created: u32, // Unix timestamp of when the response was generated
    model: []const u8, // Name of the model used to generate the response
    usage: ?struct { // Usage statistics for the response, optional
        prompt_tokens: u32, // Number of tokens in the prompt
        completion_tokens: u32, // Number of tokens in the completion
        total_tokens: u32, // Total number of tokens used
    } = null,
    timings: ?struct { // Timing statistics for the response, optional
        prompt_n: u32, // Number of prompt tokens processed
        prompt_ms: f64, // Total time taken to process the prompt, in milliseconds
        prompt_per_token_ms: f64, // Average time per prompt token, in milliseconds
        prompt_per_second: f64, // Prompt processing speed, in tokens per second
        predicted_n: u32, // Number of tokens predicted
        predicted_ms: f64, // Total time taken to generate the predictions, in milliseconds
        predicted_per_token_ms: f64, // Average time per predicted token, in milliseconds
        predicted_per_second: f64, // Prediction speed, in tokens per second
    } = null,
    choices: []struct { // Array of choices generated by the model
        message: struct { // Message generated by the model
            role: []const u8,
            content: []const u8,
        },
        logprobs: ?struct { // Log probabilities of the tokens generated, optional
            content: []struct { // Array of token logprob objects
                token: []const u8, // Token ID or string representation of the token
                logprob: f64, // Using f64 for double precision log probabilities
                bytes: []const u8, // Raw bytes of the token
                // top_logprobs is an array of objects, each containing a token and its logprob.
                // It is present only if top_logprobs was requested in the API call, so it
                // defaults to null to allow the field to be omitted.
                top_logprobs: ?[]struct {
                    token: []const u8,
                    logprob: f64,
                } = null,
            },
        } = null,
        finish_reason: []const u8, // Reason for finishing the response
        index: u32, // Index of the choice in the array
    },
    system_fingerprint: []const u8, // Fingerprint of the system used to generate the response
};
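
// For reference, an abridged sketch of the kind of JSON body LLMResponse is meant to
// deserialize (the values below are made up for illustration, not actual server output):
//
//   {
//     "id": "chatcmpl-abc123",
//     "object": "chat.completion",
//     "created": 1700000000,
//     "model": "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
//     "choices": [
//       {
//         "index": 0,
//         "message": { "role": "assistant", "content": "..." },
//         "logprobs": null,
//         "finish_reason": "stop"
//       }
//     ],
//     "usage": { "prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30 },
//     "system_fingerprint": "..."
//   }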

// DTO for serialization (when sending requests)
const Message = struct {
    role: []const u8,
    content: []const u8,
};

const RequestPayload = struct {
    model: []const u8,
    messages: []Message,
};
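
// A quick sanity check of what the request DTOs serialize to. This assumes Zig's default
// std.json.stringify behavior (fields in declaration order, minified output); the model
// name and message below are made up for illustration. Run with `zig test`.
test "RequestPayload serializes to an OpenAI-style chat completion body" {
    var messages = [_]Message{
        .{ .role = "user", .content = "hi" },
    };
    const payload = RequestPayload{ .model = "test-model", .messages = &messages };
    const json = try std.json.stringifyAlloc(std.testing.allocator, payload, .{});
    defer std.testing.allocator.free(json);
    try std.testing.expectEqualStrings(
        "{\"model\":\"test-model\",\"messages\":[{\"role\":\"user\",\"content\":\"hi\"}]}",
        json,
    );
}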

/// Formats a multiline string template with a varying number of dynamic string arguments via substitutions
///
/// The template is expected to contain "{s}" placeholders where the dynamic arguments
/// should be inserted. Each line of the template is treated as a potential insertion point.
///
/// Returns an allocated string containing the formatted template.
/// Caller owns the returned memory.
pub fn formatTemplate(allocator: std.mem.Allocator, template: []const u8, substitutions: []const []const u8) ![]u8 {
    var result = std.ArrayList(u8).init(allocator);
    errdefer result.deinit();

    var index: usize = 0;
    var line_iter = std.mem.splitScalar(u8, template, '\n');

    // Split the template by newline and iterate through each line
    while (line_iter.next()) |line| {
        var parts = std.mem.splitSequence(u8, line, "{s}"); // Split each line by the "{s}" placeholder
        try result.writer().print("{s}", .{parts.next().?}); // Print the first part
        while (parts.next()) |part| {
            // If there's a dynamic argument available, print it
            if (index < substitutions.len) {
                try result.writer().print("{s}", .{substitutions[index]});
                index += 1;
            }
            try result.writer().print("{s}", .{part}); // Print the next part of the line
        }
        try result.writer().writeByte('\n'); // Add a newline after each line is processed
    }
    _ = result.pop(); // Remove the last (unnecessary) newline added by the loop

    return result.toOwnedSlice();
}
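
// A small sanity check for formatTemplate; the template and substitutions here are
// made up for illustration. Run with `zig test`.
test "formatTemplate fills {s} placeholders in order" {
    const formatted = try formatTemplate(
        std.testing.allocator,
        "hello {s}, my name is {s}",
        &[_][]const u8{ "world", "jocasta" },
    );
    defer std.testing.allocator.free(formatted);
    try std.testing.expectEqualStrings("hello world, my name is jocasta", formatted);
}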

/// Invoke an LLM with a given system prompt and user prompt
/// Returns an LLMResponse instance
/// Caller owns returned memory and must call .deinit()
pub fn llmCall(allocator: std.mem.Allocator, system_prompt: []const u8, user_prompt: []const u8) !std.json.Parsed(LLMResponse) {
    // Handles all memory allocations for the network request
    // This means any derived deinits are all noops, so can be omitted
    var request_arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer request_arena.deinit();
    const request_arena_allocator = request_arena.allocator();

    // Create client
    var client = std.http.Client{ .allocator = request_arena_allocator };

    // Initialize an array list to store the response body bytes
    var body = std.ArrayList(u8).init(request_arena_allocator);

    // Parse URI for POST endpoint /v1/chat/completions
    const uri = try std.Uri.parse("http://127.0.0.1:1337/v1/chat/completions");

    // Prepare request payload
    var messages = [_]Message{
        Message{ .role = "system", .content = system_prompt },
        Message{ .role = "user", .content = user_prompt },
    };
    const request_payload = RequestPayload{
        .model = "Qwen_Qwen3-4B-Instruct-2507-IQ4_XS",
        .messages = &messages,
    };
    const payload = try std.json.stringifyAlloc(request_arena_allocator, request_payload, .{});

    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Payload: {s}\n", .{payload});

    // Make the POST request
    const response = try client.fetch(.{
        .method = .POST,
        .location = .{ .uri = uri },
        .response_storage = .{ .dynamic = &body },
        .payload = payload,
        .headers = .{
            .content_type = .{ .override = "application/json" },
            .accept_encoding = .{ .override = "application/json" },
            .authorization = .{ .override = "Bearer so-this-is-an-api-key" },
        },
    });

    // Print the response status
    std.debug.print("{s}\n", .{"=" ** 50});
    std.debug.print("Response status: {}\n", .{response.status});

    // Do whatever you need to in case of HTTP error.
    if (response.status != .ok) {
        std.debug.print("HTTP Error: {}\n", .{response.status});
        std.debug.print("Response body: {s}\n", .{body.items});
        std.debug.print("Error connecting to llama-server: {s}\n", .{body.items});
    }

    // Deserialize JSON response into a struct
    const parsed = try std.json.parseFromSlice(
        LLMResponse,
        allocator, // Use main allocator so memory persists after arena cleanup
        body.items,
        .{
            .allocate = .alloc_always,
            .parse_numbers = true,
            .ignore_unknown_fields = true,
            .duplicate_field_behavior = .use_last,
        },
    );
    // note: wow an arena is perfect for this typa control flow lol
    return parsed;
}

pub fn main() !void {
    var gpa = std.heap.GeneralPurposeAllocator(.{}){};
    const allocator = gpa.allocator(); // a.k.a. debug allocator
    defer {
        if (gpa.deinit() == .leak) {
            std.debug.print("Memory leak detected\n", .{});
            std.process.exit(1);
        }
    }

    const system_prompt_template =
        \\You are a helpful assistant.
        \\The user's name is {s}.
        \\And your identity is {s}.
    ;
    const system_prompt_vars = [_][]const u8{ "raja", "jocasta" };
    const system_prompt = try formatTemplate(allocator, system_prompt_template, &system_prompt_vars);
    defer allocator.free(system_prompt);

    std.debug.print("system prompt: {s}\n", .{system_prompt});
    std.debug.print("{s}\n", .{"=" ** 50});

    const prompt = "who are we?";
    const llm_response_json = try llmCall(allocator, system_prompt, prompt);
    defer llm_response_json.deinit();
    const llm_response = llm_response_json.value;

    // Assistant's response
    const content = llm_response.choices[0].message.content;
    try std.io.getStdOut().writer().print("Assistant: {s}\n", .{content});
}
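
// To build and run this file directly (assuming it is saved as llama_cpp_client.zig and you
// have a Zig toolchain whose std matches the APIs used above, e.g. the 0.13/0.14 era):
//   zig run llama_cpp_client.zig
// and to run the test blocks above:
//   zig test llama_cpp_client.zig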
Uses std.json for (de)serialization and std.http for interneting. Just make sure you have an installation of llama.cpp, that you have its OpenAI-compatible inference server fired up and listening on http://127.0.0.1:1337, and finally that you have a .GGUF of Qwen_Qwen3-4B-Instruct-2507-IQ4_XS downloaded and ready to go.