diff --git a/common/arg.cpp b/common/arg.cpp index 5528eeb1692..16cb2e03a6f 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -105,6 +105,16 @@ bool common_arg::is_exclude(enum llama_example ex) { bool common_arg::get_value_from_env(std::string & output) const { if (env == nullptr) return false; + if (!args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + char * neg_value = std::getenv(neg_env.c_str()); + if (neg_value) { + output = "0"; // falsey + return true; + } + } char * value = std::getenv(env); if (value) { output = value; @@ -114,6 +124,14 @@ bool common_arg::get_value_from_env(std::string & output) const { } bool common_arg::has_value_from_env() const { + if (env != nullptr && !args_neg.empty()) { + // for compatibility, we need to check LLAMA_ARG_NO_ env as well + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + if (std::getenv(neg_env.c_str())) { + return true; + } + } return env != nullptr && std::getenv(env); } @@ -151,9 +169,10 @@ std::string common_arg::to_string() const { std::string leading_spaces(n_leading_spaces, ' '); std::ostringstream ss; - for (const auto arg : args) { - if (arg == args.front()) { - if (args.size() == 1) { + auto all_args = get_args(); // also contains args_neg + for (const auto & arg : all_args) { + if (arg == all_args.front()) { + if (all_args.size() == 1) { ss << arg; } else { // first arg is usually abbreviation, we need padding to make it more beautiful @@ -162,7 +181,7 @@ std::string common_arg::to_string() const { ss << tmp << spaces; } } else { - ss << arg << (arg != args.back() ? ", " : ""); + ss << arg << (arg != all_args.back() ? ", " : ""); } } if (value_hint) ss << " " << value_hint; @@ -181,6 +200,31 @@ std::string common_arg::to_string() const { return ss.str(); } +std::vector<std::string> common_arg::get_args() const { + std::vector<std::string> result; + for (const auto & arg : args) { + result.push_back(std::string(arg)); + } + for (const auto & arg : args_neg) { + result.push_back(std::string(arg)); + } + return result; +} + +std::vector<std::string> common_arg::get_env() const { + std::vector<std::string> result; + if (env) { + result.push_back(std::string(env)); + } + if (!args_neg.empty() && env) { + // for compatibility, we need to add LLAMA_ARG_NO_ variant + std::string neg_env = env; + string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_"); + result.push_back(neg_env); + } + return result; +} + // // utils // @@ -316,6 +360,16 @@ static std::string get_all_kv_cache_types() { return msg.str(); } +static bool parse_bool_value(const std::string & value) { + if (is_truthy(value)) { + return true; + } else if (is_falsey(value)) { + return false; + } else { + throw std::invalid_argument("invalid boolean value"); + } +} + // // CLI argument parsing functions // @@ -323,10 +377,13 @@ static std::string get_all_kv_cache_types() { static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) { common_params & params = ctx_arg.params; - std::unordered_map<std::string, common_arg *> arg_to_options; + std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options; for (auto & opt : ctx_arg.options) { for (const auto & arg : opt.args) { - arg_to_options[arg] = &opt; + arg_to_options[arg] = {&opt, /* is_positive */ true}; + } + for (const auto & arg : opt.args_neg) { + arg_to_options[arg] = {&opt, /* is_positive */ false}; } } @@ -335,12 +392,15 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context std::string
value; if (opt.get_value_from_env(value)) { try { - if (opt.handler_void && (value == "1" || value == "true")) { + if (opt.handler_void && is_truthy(value)) { opt.handler_void(params); } if (opt.handler_int) { opt.handler_int(params, std::stoi(value)); } + if (opt.handler_bool) { + opt.handler_bool(params, parse_bool_value(value)); + } if (opt.handler_string) { opt.handler_string(params, value); continue; @@ -369,7 +429,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context if (arg_to_options.find(arg) == arg_to_options.end()) { throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str())); } - auto opt = *arg_to_options[arg]; + auto & tmp = arg_to_options[arg]; + auto opt = *tmp.first; + bool is_positive = tmp.second; if (opt.has_value_from_env()) { fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str()); } @@ -378,6 +440,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context opt.handler_void(params); continue; } + if (opt.handler_bool) { + opt.handler_bool(params, is_positive); + continue; + } // arg with single value check_arg(i); @@ -402,7 +468,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument(string_format( "error while handling argument \"%s\": %s\n\n" "usage:\n%s\n\nto show complete usage, run with -h", - arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str())); + arg.c_str(), e.what(), opt.to_string().c_str())); } } @@ -750,11 +816,11 @@ static std::string list_builtin_chat_templates() { } bool common_arg_utils::is_truthy(const std::string & value) { - return value == "on" || value == "enabled" || value == "1"; + return value == "on" || value == "enabled" || value == "true" || value == "1"; } bool common_arg_utils::is_falsey(const std::string & value) { - return value == "off" || value == "disabled" || value == "0"; + return value == "off" || value == "disabled" || value == "false" || value == "0"; } bool common_arg_utils::is_autoy(const std::string & value) { @@ -839,10 +905,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--display-prompt"}, {"--no-display-prompt"}, - string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"), - [](common_params & params) { - params.display_prompt = false; + string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"), + [](common_params & params, bool value) { + params.display_prompt = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1055,18 +1122,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.kv_unified = true; } ).set_env("LLAMA_ARG_KV_UNIFIED")); - add_opt(common_arg( - {"--no-context-shift"}, - string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"), - [](common_params & params) { - params.ctx_shift = false; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--context-shift"}, - string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? 
"enabled" : "disabled"), - [](common_params & params) { - params.ctx_shift = true; + {"--no-context-shift"}, + string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.ctx_shift = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT")); add_opt(common_arg( @@ -1106,20 +1167,22 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION})); add_opt(common_arg( + {"--perf"}, {"--no-perf"}, - string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), - [](common_params & params) { - params.no_perf = true; - params.sampling.no_perf = true; + string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"), + [](common_params & params, bool value) { + params.no_perf = !value; + params.sampling.no_perf = !value; } - ).set_env("LLAMA_ARG_NO_PERF")); + ).set_env("LLAMA_ARG_PERF")); add_opt(common_arg( + {"--show-timings"}, {"--no-show-timings"}, - string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"), - [](common_params & params) { - params.show_timings = false; + string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"), + [](common_params & params, bool value) { + params.show_timings = value; } - ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS")); + ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS")); add_opt(common_arg( {"-f", "--file"}, "FNAME", "a file containing the prompt (default: none)", @@ -1171,16 +1234,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_excludes({LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-e", "--escape"}, - string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"), - [](common_params & params) { - params.escape = true; - } - )); - add_opt(common_arg( {"--no-escape"}, - "do not process escape sequences", - [](common_params & params) { - params.escape = false; + string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? 
"true" : "false"), + [](common_params & params, bool value) { + params.escape = value; } )); add_opt(common_arg( @@ -1227,19 +1284,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( {"-cnv", "--conversation"}, - "run in conversation mode:\n" + {"-no-cnv", "--no-conversation"}, + "whether to run in conversation mode:\n" "- does not print special tokens and suffix/prefix\n" "- interactive mode is also enabled\n" "(default: auto enabled if chat template is available)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED; - } - ).set_examples({LLAMA_EXAMPLE_COMPLETION})); - add_opt(common_arg( - {"-no-cnv", "--no-conversation"}, - "force disable conversation mode (default: false)", - [](common_params & params) { - params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED; + [](common_params & params, bool value) { + params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI})); add_opt(common_arg( @@ -1297,10 +1348,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"--warmup"}, {"--no-warmup"}, - "skip warming up the model with an empty run", - [](common_params & params) { - params.warmup = false; + string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.warmup = value; } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( @@ -1702,19 +1754,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION})); add_opt(common_arg( + {"-kvo", "--kv-offload"}, {"-nkvo", "--no-kv-offload"}, - "disable KV offload", - [](common_params & params) { - params.no_kv_offload = true; + string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_kv_offload = !value; } - ).set_env("LLAMA_ARG_NO_KV_OFFLOAD")); + ).set_env("LLAMA_ARG_KV_OFFLOAD")); add_opt(common_arg( + {"--repack"}, {"-nr", "--no-repack"}, - "disable weight repacking", - [](common_params & params) { - params.no_extra_bufts = true; + string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_extra_bufts = !value; } - ).set_env("LLAMA_ARG_NO_REPACK")); + ).set_env("LLAMA_ARG_REPACK")); add_opt(common_arg( {"--no-host"}, "bypass host buffer allowing extra buffers to be used", @@ -1843,18 +1897,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PARALLEL})); add_opt(common_arg( {"-cb", "--cont-batching"}, - string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? 
"enabled" : "disabled"), - [](common_params & params) { - params.cont_batching = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); - add_opt(common_arg( {"-nocb", "--no-cont-batching"}, - "disable continuous batching", - [](common_params & params) { - params.cont_batching = false; + string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.cont_batching = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING")); add_opt(common_arg( {"-mm", "--mmproj"}, "FILE", "path to a multimodal projector file. see tools/mtmd/README.md\n" @@ -1871,19 +1919,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL")); add_opt(common_arg( - {"--no-mmproj"}, - "explicitly disable multimodal projector, useful when using -hf", - [](common_params & params) { - params.no_mmproj = true; + {"--mmproj-auto"}, + {"--no-mmproj", "--no-mmproj-auto"}, + string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"), + [](common_params & params, bool value) { + params.no_mmproj = !value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO")); add_opt(common_arg( + {"--mmproj-offload"}, {"--no-mmproj-offload"}, - "do not offload multimodal projector to GPU", - [](common_params & params) { - params.mmproj_use_gpu = false; + string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.mmproj_use_gpu = value; } - ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD")); + ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD")); add_opt(common_arg( {"--image", "--audio"}, "FILE", "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n", @@ -1923,12 +1973,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_env("LLAMA_ARG_MLOCK")); add_opt(common_arg( + {"--mmap"}, {"--no-mmap"}, - "do not memory-map model (slower load but may reduce pageouts if not using mlock)", - [](common_params & params) { - params.use_mmap = false; + string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_mmap = value; } - ).set_env("LLAMA_ARG_NO_MMAP")); + ).set_env("LLAMA_ARG_MMAP")); add_opt(common_arg( {"--numa"}, "TYPE", "attempt optimizations that help on some NUMA systems\n" @@ -2116,10 +2167,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } )); add_opt(common_arg( + {"--op-offload"}, {"--no-op-offload"}, - string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"), - [](common_params & params) { - params.no_op_offload = true; + string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? 
"false" : "true"), + [](common_params & params, bool value) { + params.no_op_offload = !value; } )); add_opt(common_arg( @@ -2315,10 +2367,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( + {"--ppl"}, {"--no-ppl"}, - string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), - [](common_params & params) { - params.compute_ppl = false; + string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"), + [](common_params & params, bool value) { + params.compute_ppl = value; } ).set_examples({LLAMA_EXAMPLE_IMATRIX})); add_opt(common_arg( @@ -2437,12 +2490,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX")); add_opt(common_arg( + {"--webui"}, {"--no-webui"}, - string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), - [](common_params & params) { - params.webui = false; + string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.webui = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI")); add_opt(common_arg( {"--embedding", "--embeddings"}, string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"), @@ -2547,18 +2601,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS")); add_opt(common_arg( {"--slots"}, - string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), - [](common_params & params) { - params.endpoint_slots = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); - add_opt(common_arg( {"--no-slots"}, - "disables slots monitoring endpoint", - [](common_params & params) { - params.endpoint_slots = false; + string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.endpoint_slots = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS")); add_opt(common_arg( {"--slot-save-path"}, "PATH", "path to save slot kv cache (default: disabled)", @@ -2609,26 +2657,21 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX")); add_opt(common_arg( + {"--models-autoload"}, {"--no-models-autoload"}, - "disables automatic loading of models (default: enabled)", - [](common_params & params) { - params.models_autoload = false; + string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? 
"enabled" : "disabled"), + [](common_params & params, bool value) { + params.models_autoload = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD")); add_opt(common_arg( {"--jinja"}, - string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), - [](common_params & params) { - params.use_jinja = true; - } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); - add_opt(common_arg( {"--no-jinja"}, - string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"), - [](common_params & params) { - params.use_jinja = false; + string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"), + [](common_params & params, bool value) { + params.use_jinja = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA")); + ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA")); add_opt(common_arg( {"--reasoning-format"}, "FORMAT", "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n" @@ -2673,15 +2716,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE")); add_opt(common_arg( + {"--prefill-assistant"}, {"--no-prefill-assistant"}, string_format( "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n" "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n" ), - [](common_params & params) { - params.prefill_assistant = false; + [](common_params & params, bool value) { + params.prefill_assistant = value; } - ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT")); + ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT")); add_opt(common_arg( {"-sps", "--slot-prompt-similarity"}, "SIMILARITY", string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity), diff --git a/common/arg.h b/common/arg.h index 219c115e635..6db38da488a 100644 --- a/common/arg.h +++ b/common/arg.h @@ -16,6 +16,7 @@ struct common_arg { std::set examples = {LLAMA_EXAMPLE_COMMON}; std::set excludes = {}; std::vector args; + std::vector args_neg; // for negated args like --no-xxx const char * value_hint = nullptr; // help text or example for arg value const char * value_hint_2 = nullptr; // for second arg value const char * env = nullptr; @@ -25,6 +26,7 @@ struct common_arg { void (*handler_string) (common_params & params, const std::string &) = nullptr; void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr; void (*handler_int) (common_params & params, int) = nullptr; + void (*handler_bool) (common_params & params, bool) = nullptr; common_arg() = default; @@ -48,6 +50,13 @@ struct common_arg { void (*handler)(common_params & params) ) : args(args), help(help), handler_void(handler) 
{} + common_arg( + const std::initializer_list<const char *> & args, + const std::initializer_list<const char *> & args_neg, + const std::string & help, + void (*handler)(common_params & params, bool) + ) : args(args), args_neg(args_neg), help(help), handler_bool(handler) {} + // support 2 values for arg common_arg( const std::initializer_list<const char *> & args, @@ -80,6 +89,10 @@ struct common_arg { } return strcmp(args[0], other.args[0]) == 0; } + + // get all args and env vars (including negated args/env) + std::vector<std::string> get_args() const; + std::vector<std::string> get_env() const; }; namespace common_arg_utils { diff --git a/common/preset.cpp b/common/preset.cpp index 09ac171b720..729c27f2cfc 100644 --- a/common/preset.cpp +++ b/common/preset.cpp @@ -23,8 +23,14 @@ std::vector<std::string> common_preset::to_args() const { if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) { // flag option, no value if (common_arg_utils::is_falsey(value)) { - // skip the flag - args.pop_back(); + // use negative arg if available + if (!opt.args_neg.empty()) { + args.back() = opt.args_neg.back(); + } else { + // otherwise, skip the flag + // TODO: maybe throw an error instead? + args.pop_back(); + } } } if (opt.value_hint != nullptr) { @@ -141,10 +147,10 @@ static std::map<std::string, std::map<std::string, std::string>> parse_ini_from_ static std::map<std::string, common_arg> get_map_key_opt(common_params_context & ctx_params) { std::map<std::string, common_arg> mapping; for (const auto & opt : ctx_params.options) { - if (opt.env != nullptr) { - mapping[opt.env] = opt; + for (const auto & env : opt.get_env()) { + mapping[env] = opt; } - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { mapping[rm_leading_dashes(arg)] = opt; } } diff --git a/examples/gen-docs/gen-docs.cpp b/examples/gen-docs/gen-docs.cpp index 420195f1985..e9f7bf93130 100644 --- a/examples/gen-docs/gen-docs.cpp +++ b/examples/gen-docs/gen-docs.cpp @@ -14,12 +14,13 @@ static void write_table_header(std::ofstream & file) { static void write_table_entry(std::ofstream & file, const common_arg & opt) { file << "| `"; // args - for (const auto & arg : opt.args) { - if (arg == opt.args.front()) { + auto all_args = opt.get_args(); + for (const auto & arg : all_args) { + if (arg == all_args.front()) { file << arg; - if (opt.args.size() > 1) file << ", "; + if (all_args.size() > 1) file << ", "; } else { - file << arg << (arg != opt.args.back() ? ", " : ""); + file << arg << (arg != all_args.back() ?
", " : ""); } } // value hint diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index a60ca12fe59..90750b20c2a 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -20,20 +20,20 @@ int main(void) { std::unordered_set seen_env_vars; for (const auto & opt : ctx_arg.options) { // check for args duplications - for (const auto & arg : opt.args) { + for (const auto & arg : opt.get_args()) { if (seen_args.find(arg) == seen_args.end()) { seen_args.insert(arg); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg); + fprintf(stderr, "test-arg-parser: found different handlers for the same argument: %s", arg.c_str()); exit(1); } } // check for env var duplications - if (opt.env) { - if (seen_env_vars.find(opt.env) == seen_env_vars.end()) { - seen_env_vars.insert(opt.env); + for (const auto & env : opt.get_env()) { + if (seen_env_vars.find(env) == seen_env_vars.end()) { + seen_env_vars.insert(env); } else { - fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", opt.env); + fprintf(stderr, "test-arg-parser: found different handlers for the same env var: %s", env.c_str()); exit(1); } } @@ -115,6 +115,14 @@ int main(void) { assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); + printf("test-arg-parser: test negated environment variables\n\n"); + + setenv("LLAMA_ARG_MMAP", "0", true); + setenv("LLAMA_ARG_NO_PERF", "1", true); // legacy format + argv = {"binary_name"}; + assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); + assert(params.use_mmap == false); + assert(params.no_perf == true); printf("test-arg-parser: test environment variables being overwritten\n\n"); diff --git a/tools/server/README.md b/tools/server/README.md index 91cafa9425b..073bcd2ccd7 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -54,9 +54,8 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--swa-full` | use full-size SWA cache (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
(env: LLAMA_ARG_SWA_FULL) | | `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)
(env: LLAMA_ARG_KV_UNIFIED) | | `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')
(env: LLAMA_ARG_FLASH_ATTN) | -| `--no-perf` | disable internal libllama performance timings (default: false)
(env: LLAMA_ARG_NO_PERF) | -| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) | -| `--no-escape` | do not process escape sequences | +| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: true)
(env: LLAMA_ARG_PERF) | +| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) | | `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model
(env: LLAMA_ARG_ROPE_SCALING_TYPE) | | `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N
(env: LLAMA_ARG_ROPE_SCALE) | | `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)
(env: LLAMA_ARG_ROPE_FREQ_BASE) | @@ -66,15 +65,15 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)
(env: LLAMA_ARG_YARN_ATTN_FACTOR) | | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_SLOW) | | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)
(env: LLAMA_ARG_YARN_BETA_FAST) | -| `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) | -| `-nr, --no-repack` | disable weight repacking
(env: LLAMA_ARG_NO_REPACK) | -| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | +| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)
(env: LLAMA_ARG_KV_OFFLOAD) | +| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)
(env: LLAMA_ARG_REPACK) | +| `--no-host` | bypass host buffer allowing extra buffers to be used
(env: LLAMA_ARG_NO_HOST) | | `-ctk, --cache-type-k TYPE` | KV cache data type for K
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_K) | | `-ctv, --cache-type-v TYPE` | KV cache data type for V
allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
(default: f16)
(env: LLAMA_ARG_CACHE_TYPE_V) | | `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)
(env: LLAMA_ARG_DEFRAG_THOLD) | | `-np, --parallel N` | number of parallel sequences to decode (default: 1)
(env: LLAMA_ARG_N_PARALLEL) | | `--mlock` | force system to keep model in RAM rather than swapping or compressing
(env: LLAMA_ARG_MLOCK) | -| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | +| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)
(env: LLAMA_ARG_MMAP) | | `--numa TYPE` | attempt optimizations that help on some NUMA systems
- distribute: spread execution evenly over all nodes
- isolate: only spawn threads on CPUs on the node that execution started on
- numactl: use the CPU map provided by numactl
if run without this previously, it is recommended to drop the system page cache before using this
see https://github.com/ggml-org/llama.cpp/issues/1437
(env: LLAMA_ARG_NUMA) | | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)
use --list-devices to see a list of available devices
(env: LLAMA_ARG_DEVICE) | | `--list-devices` | print list of available devices and exit | @@ -87,7 +86,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)
(env: LLAMA_ARG_MAIN_GPU) | | `--check-tensors` | check model tensor data for invalid values (default: false) | | `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.
types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false | -| `--no-op-offload` | disable offloading host tensor operations to device (default: false) | +| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) | | `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) | | `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) | | `--control-vector FNAME` | add a control vector
note: this argument can be repeated to add multiple control vectors | @@ -157,19 +156,18 @@ For the ful list of features, please refer to [server's changelog](https://githu | -------- | ----------- | | `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)
(env: LLAMA_ARG_CTX_CHECKPOINTS) | | `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)
[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)
(env: LLAMA_ARG_CACHE_RAM) | -| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)
(env: LLAMA_ARG_NO_CONTEXT_SHIFT) | -| `--context-shift` | enables context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | +| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)
(env: LLAMA_ARG_CONTEXT_SHIFT) | | `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode
| | `-sp, --special` | special tokens output enabled (default: false) | -| `--no-warmup` | skip warming up the model with an empty run | +| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) | | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) | | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified
(env: LLAMA_ARG_POOLING) | -| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | -| `-nocb, --no-cont-batching` | disable continuous batching
(env: LLAMA_ARG_NO_CONT_BATCHING) | +| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)
(env: LLAMA_ARG_CONT_BATCHING) | | `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md
note: if -hf is used, this argument can be omitted
(env: LLAMA_ARG_MMPROJ) | | `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md
(env: LLAMA_ARG_MMPROJ_URL) | -| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf
(env: LLAMA_ARG_NO_MMPROJ) | -| `--no-mmproj-offload` | do not offload multimodal projector to GPU
(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) | +| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)
(env: LLAMA_ARG_MMPROJ_AUTO) | +| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)
(env: LLAMA_ARG_MMPROJ_OFFLOAD) | | `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MIN_TOKENS) | | `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)
(env: LLAMA_ARG_IMAGE_MAX_TOKENS) | | `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model | @@ -180,7 +178,7 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--port PORT` | port to listen (default: 8080)
(env: LLAMA_ARG_PORT) | | `--path PATH` | path to serve static files from (default: )
(env: LLAMA_ARG_STATIC_PATH) | | `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )
(env: LLAMA_ARG_API_PREFIX) | -| `--no-webui` | Disable the Web UI (default: enabled)
(env: LLAMA_ARG_NO_WEBUI) | +| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)
(env: LLAMA_ARG_WEBUI) | | `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)
(env: LLAMA_ARG_EMBEDDINGS) | | `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)
(env: LLAMA_ARG_RERANKING) | | `--api-key KEY` | API key to use for authentication (default: none)
(env: LLAMA_API_KEY) | @@ -193,20 +191,19 @@ For the ful list of features, please refer to [server's changelog](https://githu | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)
[(card)](https://ggml.ai/f0.png)
(env: LLAMA_ARG_CACHE_REUSE) | | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)
(env: LLAMA_ARG_ENDPOINT_METRICS) | | `--props` | enable changing global properties via POST /props (default: disabled)
(env: LLAMA_ARG_ENDPOINT_PROPS) | -| `--slots` | enable slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | -| `--no-slots` | disables slots monitoring endpoint
(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) | +| `--slots, --no-slots` | expose slots monitoring endpoint (default: enabled)
(env: LLAMA_ARG_ENDPOINT_SLOTS) | | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) | +| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) | | `--models-dir PATH` | directory containing models for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_DIR) | +| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)
(env: LLAMA_ARG_MODELS_PRESET) | | `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)
(env: LLAMA_ARG_MODELS_MAX) | -| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)
(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) | -| `--no-models-autoload` | disables automatic loading of models (default: enabled)
(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) | -| `--jinja` | use jinja template for chat (default: enabled)

(env: LLAMA_ARG_JINJA) | -| `--no-jinja` | disable jinja template for chat (default: enabled)

(env: LLAMA_ARG_NO_JINJA) | +| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)
(env: LLAMA_ARG_MODELS_AUTOLOAD) | +| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)
(env: LLAMA_ARG_JINJA) | | `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:
- none: leaves thoughts unparsed in `message.content`
- deepseek: puts thoughts in `message.reasoning_content`
- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`
(default: auto)
(env: LLAMA_ARG_THINK) | | `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)
(env: LLAMA_ARG_THINK_BUDGET) | | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE) | | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)
if suffix/prefix are specified, template will be disabled
only commonly used templates are accepted (unless --jinja is set before this flag):
list of built-in templates:
bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr
(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) | -| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) | +| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)
when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled

(env: LLAMA_ARG_PREFILL_ASSISTANT) | | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)
| | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) | | `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) | @@ -236,6 +233,11 @@ For the ful list of features, please refer to [server's changelog](https://githu Note: If both command line argument and environment variable are both set for the same param, the argument will take precedence over env var. +For boolean options like `--mmap` or `--kv-offload`, the environment variable is interpreted as follows: +- `LLAMA_ARG_MMAP=true` means enabled; other accepted truthy values are `1`, `on`, `enabled` +- `LLAMA_ARG_MMAP=false` means disabled; other accepted falsey values are `0`, `off`, `disabled` +- If `LLAMA_ARG_NO_MMAP` is set (regardless of its value), mmap is disabled; this legacy `LLAMA_ARG_NO_*` form is kept for backward compatibility + Example usage of docker compose with environment variables: ```yml
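# Illustrative sketch added for clarity; this is not the README's original compose example,
# which is truncated here. It shows the boolean environment-variable convention described
# above. The image name, port, and model path are placeholder assumptions.
services:
  llamacpp:
    image: ghcr.io/ggml-org/llama.cpp:server   # assumed image name
    ports:
      - "8080:8080"
    environment:
      LLAMA_ARG_MODEL: /models/model.gguf      # placeholder model path
      LLAMA_ARG_MMAP: 0                        # falsey value, same effect as passing --no-mmap
      LLAMA_ARG_CONTEXT_SHIFT: 1               # truthy value, same effect as passing --context-shift
      LLAMA_ARG_NO_PERF: 1                     # legacy negated form: any value disables perf timings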