#llm #openai #nlp #anthropic #gguf #hugging-face #encoding


A Swiss army knife for LLMs with features for Llama.cpp, Openai, Anthropic, Mistral-rs

6 releases

new 0.0.6 Jun 9, 2024
0.0.5 Jun 6, 2024
0.0.3 May 28, 2024

#38 in Science

Download history 355/week @ 2024-05-23 205/week @ 2024-05-30

560 downloads per month
Used in llm_client

MIT and GPL-3.0-or-later



A Swiss army knife for working with LLMs. Features supporting Llama.cpp, Openai, Anthropic, Mistral-rs. Originally made for the llm_client crate, but split into it's own crate just for you.

  • Estimate GGUF Vram usage.
  • Clean and chunk HTML and text.
  • Build grammars for Llama.cpp.
  • Ensure your prompts are within LLM token limits.


llm_utils = "*"

Tokenizer 🧮

  • Hugging Face's Tokenizer library for local models and Tiktoken-rs for OpenAI and Anthropic (Anthropic doesn't have a publically available tokenizer.)

  • Simple abstract API for encoding and decoding allows for abstract LLM consumption across multiple architechtures.

  • Safely set the max_token param for LLMs to ensure requests don't fail due to exceeding token limits!

    // Get a Tiktoken tokenizer
    let tokenizer: LlmTokenizer = LlmTokenizer::new_tiktoken("gpt-4o");

    // Get a Hugging Face tokenizer from local path
    let tokenizer: LlmTokenizer = LlmTokenizer::new_from_tokenizer_json("path/to/tokenizer.json");
    // Or load from repo
    let tokenizer: LlmTokenizer = LlmTokenizer::new_from_hf_repo(hf_token, "meta-llama/Meta-Llama-3-8B-Instruct");

    // Get tokenizan'
    let token_ids: Vec<u32> = tokenizer.tokenize("Hello there");
    let count: u32 = tokenizer.count_tokens("Hello there");
    let word_probably: String = tokenizer.detokenize_one(token_ids[0])?; 
    let words_probably: String = tokenizer.detokenize_many(token_ids)?; 

    // These function are used for generating logit bias
    let token_id: u32 = tokenizer.try_into_single_token("hello");
    let word_probably: String = tokenizer.try_from_single_token_id(1234);

Text utils 📝

    // Normalize whitespace chars to " " and "\n".
    // Reduce the number of newlines to singles or doubles (paragraphs) or convert them to " ".
    // Optionally, remove all characters besides alphabetic, numbers, and punctuation. 
    let mut text_cleaner: String = llm_utils::text_utils::clean_text::TextCleaner::new();
    let cleaned_text: String = text_cleaner

    // Convert HTML to cleaned text.
    // Uses an implementation of Mozilla's readability mode and HTML2Text.
    let cleaned_text: String = llm_utils::text_utils::clean_html::clean_html(raw_html);

    // Rule based text segmentation for sentences. 
    // Better than unicode-segmentation crate or any other crate I tested.
    // But still not as good as a model based approach like Spacy or other NLP libraries.
    let sentence_splits: Vec<String> = 

    // Split text into balanced chunks as close to the given size as possible.
    // Unlike other implementations this method attempts to keep the chunks the same size.
    // This means you won't end up with an orphan final chunk that is too small to be useful.
    // Implemented with a DFS algo, recursion, memoziation, and heuristic pre-filters.
    // Attempts to split semantically in the following order:
    // Paragraphs, newlines, sentences, words, and finally graphmemes.
    // Please note: this is much slower than other methods. It needs optimizations!
    let chunked_text: Vec<String> = 
        llm_utils::text_utils::chunk::chunk_text(text, chunk_size, Some(overlap_percent));

Model presets 🛤️

  • Presets for Open Source LLMs from Hugging Face, or API models like OpenAI, and Anthropic.

  • Load and/or download a model with metadata, tokenizer, and local path (for local LLMs like llama.cpp, vllm, mistral.rs).

  • Auto-select the largest quantized GGUF that will fit in your vram!

Supported Open Source models:

⚪ Llama 3

⚪ Mistral and Mixtral

⚪ Phi 3

    // Load the largest quantized Mistral-7B-Instruct model that will fit in your vram
    let model: OsLlm = PresetModelBuilder::new()
        .ctx_size(9001) // ctx_size impacts vram usage!

    not_a_real_assert_eq!(model, OsLlm {
        pub model_id: String,
        pub model_url: String,
        pub local_model_path: String, // Use this to load the llama.cpp server
        pub model_config_json: OsLlmConfigJson,
        pub chat_template: OsLlmChatTemplate,
        pub tokenizer: Option<LlmTokenizer>,

    // Or Openai
    let model: OpenAiLlm = OpenAiLlm::gpt_4_o();

    not_a_real_assert_eq!(model, OpenAiLlm {
        model_id: "gpt-4o".to_string(),
        context_length: 128000,
        cost_per_m_in_tokens: 5.00,
        max_tokens_output: 4096,
        cost_per_m_out_tokens: 15.00,
        tokens_per_message: 3,
        tokens_per_name: 1,
        tokenizer: Option<LlmTokenizer>,

    // Or Anthropic
    let model: AnthropicLlm = AnthropicLlm::claude_3_opus();

GGUF models from Hugging Face or local path 🚤

    // From HF
    let model_url = "https://huggingface.co/MaziyarPanahi/Meta-Llama-3-8B-Instruct-GGUF/blob/main/Meta-Llama-3-8B-Instruct.Q6_K.gguf";
    let model: OsLlm = GGUFModelBuilder::new()

    // Note: because we can't instantiate a tokenizer from a GGUF file, the returned model will not have a tokenizer!
    // However, if we provide the base model's repo, we load from there.
    let repo_id = "meta-llama/Meta-Llama-3-8B-Instruct";
    let model: OsLlm = GGUFModelBuilder::new()

    // From Local
    let local_path = "/root/.cache/huggingface/hub/models--MaziyarPanahi--Meta-Llama-3-8B-Instruct-GGUF/blobs/c2ca99d853de276fb25a13e369a0db2fd3782eff8d28973404ffa5ffca0b9267";
    let model: OsLlm = GGUFModelBuilder::new()

    // Again, we require a tokenizer.json. This can also be loaded from a local path.
    let local_config_path = "/llm_utils/src/models/open_source/llama/llama_3_8b_instruct";
    let model: OsLlm = GGUFModelBuilder::new()

Prompting 🎶

  • Generate properly formatted prompts for GGUF models, Openai, and Anthropic.

  • Uses the GGUF's chat template and Jinja templates to format the prompt to model spec.

  • Create prompts from a combination of dynamic inputs and/or static inputs from file.

    // Default formatted prompt (Openai and Anthropic format)
    let default_formatted_prompt: HashMap<String, HashMap<String, String>> = prompting::default_formatted_prompt(
        "You are a nice robot.",
        "Where do robots come from?"

    // Get total tokens in prompt
    let total_prompt_tokens: u32 = model.openai_token_count_of_prompt(&tokenizer, &default_formatted_prompt);

    // Then convert it to be used for a GGUF model
    let gguf_formatted_prompt: String = prompting::convert_default_prompt_to_model_format(

    // Since the GGUF formatted prompt is just a string, we can just use the generic count_tokens function
    let total_prompt_tokens: u32 = tokenizer.count_tokens(&gguf_formatted_prompt);

    // Validate requested max_tokens for a generation. If it exceeds the models limits, reduce max_tokens to a safe value.
    let safe_max_tokens = get_and_check_max_tokens_for_response(
            model.max_tokens_output, // If using a GGUF model use either model.context_length or the ctx_size of the server.

Grammar 🤓

    // Return a list of between 1, 4 items
    let grammar = llm_utils::grammar::create_list_grammar(1, 4);

    // List will be formatted: `- <list text>\n
    let response: String = text_generation_request(&req_config, Some(&grammar)).await?;

    // So you can easily split like:
    let response_items: Vec<String> = response
        .map(|line| line[1..].trim().to_string())

    // Exclude numbers from text generation
    let grammar = llm_utils::grammar::create_text_structured_grammar(vec![RestrictedCharacterSet::PunctuationExtended]);
    let response: String = text_generation_request(&req_config, Some(&grammar)).await?;

    // Exclude a list of common, and commonly unwanted characters from text generation
    let grammar = llm_utils::grammar::create_text_structured_grammar(vec![RestrictedCharacterSet::PunctuationExtended]);
    let response: String = text_generation_request(&req_config, Some(&grammar)).await?;

Logit bias #️⃣

  • Create properly formatted logit bias requests for LlamaCpp and Openai.

  • Functionality to add logit bias from a variety of sources, along with validation.

    // Exclude some tokens from text generation
    let mut words = HashMap::new();
    words.entry("as an ai model").or_insert(-100.0);

    // Build and validate
    let logit_bias = logit_bias::logit_bias_from_words(&tokenizer, &words)
    let validated_logit_bias = logit_bias::validate_logit_bias_values(&logit_bias)?;

    // Convert
    let openai_logit_bias = logit_bias::convert_logit_bias_to_openai_format(&validated_logit_bias)?;
    let llama_logit_bias = logit_bias::convert_logit_bias_to_llama_format(&validated_logit_bias)?;


This project is licensed under the MIT License.


My motivation for publishing is for someone to point out if I'm doing something wrong!


~890K SLoC