Cactus

LLM

Text generation, vision, streaming, and model options with Cactus

Basic Completion

import 'cactus.dart';

final model = Cactus.create('/path/to/model.gguf');
final result = model.complete('What is the capital of France?');
print(result.text);
model.dispose();
import com.cactus.*

val model = Cactus.create("/path/to/model")
val result = model.complete("What is the capital of France?")
println(result.text)
model.close()
#include <cactus.h>

cactus_model_t model = cactus_init("path/to/weight/folder", nullptr);

const char* messages = R"([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"}
])";

const char* options = R"({
    "max_tokens": 50,
    "stop_sequences": ["<|im_end|>"]
})";

char response[4096];
int result = cactus_complete(
    model, messages, response, sizeof(response),
    options, nullptr, nullptr, nullptr
);

Response Format:

{
    "success": true,
    "error": null,
    "cloud_handoff": false,
    "response": "The capital of France is Paris.",
    "function_calls": [],
    "confidence": 0.8193,
    "time_to_first_token_ms": 45.23,
    "total_time_ms": 163.67,
    "prefill_tps": 1621.89,
    "decode_tps": 168.42,
    "ram_usage_mb": 245.67,
    "prefill_tokens": 28,
    "decode_tokens": 50,
    "total_tokens": 78
}

Chat Messages

final model = Cactus.create(modelPath);
final result = model.completeMessages([
  Message.system('You are a helpful assistant.'),
  Message.user('What is 2 + 2?'),
]);
print(result.text);
model.dispose();
Cactus.create(modelPath).use { model ->
    val result = model.complete(
        messages = listOf(
            Message.system("You are a helpful assistant."),
            Message.user("What is 2 + 2?")
        )
    )
    println(result.text)
}

Completion Options

final options = CompletionOptions(
  temperature: 0.7,
  topP: 0.9,
  topK: 40,
  maxTokens: 256,
  stopSequences: ['\n\n'],
);

final result = model.complete('Write a haiku:', options: options);
val options = CompletionOptions(
    temperature = 0.7f,
    topP = 0.9f,
    topK = 40,
    maxTokens = 256,
    stopSequences = listOf("\n\n")
)

val result = model.complete("Write a haiku:", options)

Streaming

final result = model.complete(
  'Tell me a story',
  callback: (token, tokenId) {
    print(token);
  },
);
val result = model.complete(
    messages = listOf(Message.user("Tell me a story")),
    callback = TokenCallback { token, tokenId ->
        print(token)
    }
)
void token_callback(const char* token, int token_id, void* user_data) {
    printf("%s", token);
    fflush(stdout);
}

cactus_complete(
    model, messages, response, sizeof(response),
    nullptr, nullptr,
    token_callback,  // streaming callback
    nullptr          // user data
);

Cloud Handoff

When the model lacks confidence, the response signals a cloud handoff:

The CompletionResult includes needsCloudHandoff and confidence fields. Check these to decide whether to route to a cloud API.

{
    "success": true,
    "cloud_handoff": true,
    "response": null,
    "confidence": 0.42
}

Your application should route to a cloud API when cloud_handoff is true.

Performance Tips

  • Model Selection - Use smaller models (lfm2-350m) for faster inference on mobile
  • Quantization - int4 uses less memory, int8 is more accurate
  • NPU Acceleration - Available on Apple devices for vision and transcription models
  • Memory - Always call dispose() / close() when done to free resources
  • Reuse model handles across requests (don't reinitialize)