Skip to content

Commit 0665a7f

Browse files
authored
feat: add hidream o1 image support (#1485)
1 parent eeac950 commit 0665a7f

20 files changed

Lines changed: 1703 additions & 334 deletions

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ API and command-line option may change frequently.***
5858
- [Ovis-Image](./docs/ovis_image.md)
5959
- [Anima](./docs/anima.md)
6060
- [ERNIE-Image](./docs/ernie_image.md)
61+
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
6162
- Image Edit Models
6263
- [FLUX.1-Kontext-dev](./docs/kontext.md)
6364
- [Qwen Image Edit series](./docs/qwen_image_edit.md)
@@ -148,6 +149,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
148149
- [Ovis-Image](./docs/ovis_image.md)
149150
- [Anima](./docs/anima.md)
150151
- [ERNIE-Image](./docs/ernie_image.md)
152+
- [HiDream-O1-Image](./docs/hidream_o1_image.md)
151153
- [LoRA](./docs/lora.md)
152154
- [LCM/LCM-LoRA](./docs/lcm.md)
153155
- [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

assets/hidream-o1/dev_example.png

2.17 MB
Loading

docs/hidream_o1_image.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# How to Use
2+
3+
## Download weights
4+
5+
- Download HiDream-O1-Image-Dev
6+
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
7+
- Download HiDream-O1-Image
8+
- safetensors: https://huggingface.co/Comfy-Org/HiDream-O1-Image/tree/main/checkpoints
9+
10+
## Examples
11+
12+
### HiDream-O1-Image-Dev
13+
14+
```
15+
.\bin\Release\sd-cli.exe -m ..\..\ComfyUI\models\diffusion_models\hidream_o1_image_dev_bf16.safetensors -p "a lovely cat holding a sign says 'hidream o1 cpp'" --cfg-scale 1.0 -v -H 1024 -W 1024
17+
```
18+
19+
<img width="256" alt="HiDream-O1-Image-Dev example" src="../assets/hidream-o1/dev_example.png" />
20+

examples/cli/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ Generation Options:
103103
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
104104
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
105105
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
106+
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
107+
noise_scale_start, noise_scale_end
106108
-H, --height <int> image height, in pixel space (default: 512)
107109
-W, --width <int> image width, in pixel space (default: 512)
108110
--steps <int> number of sample steps (default: 20)

examples/common/common.cpp

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -807,6 +807,10 @@ ArgOptions SDGenerationParams::get_options() {
807807
"Latent (antialiased), Latent (bicubic), Latent (bicubic antialiased), or a model name "
808808
"under --hires-upscalers-dir (default: Latent)",
809809
&hires_upscaler},
810+
{"",
811+
"--extra-sample-args",
812+
"extra sampler args, key=value list. Currently lcm supports noise_clip_std, noise_scale_start, noise_scale_end",
813+
&extra_sample_args},
810814
};
811815

812816
options.int_options = {
@@ -1607,6 +1611,7 @@ bool SDGenerationParams::from_json_str(
16071611

16081612
auto parse_sample_params_json = [&](const json& sample_json,
16091613
sd_sample_params_t& target_params,
1614+
std::string& target_extra_sample_args,
16101615
std::vector<int>& target_skip_layers,
16111616
std::vector<float>* target_custom_sigmas) {
16121617
if (sample_json.contains("sample_steps") && sample_json["sample_steps"].is_number_integer()) {
@@ -1621,6 +1626,9 @@ bool SDGenerationParams::from_json_str(
16211626
if (sample_json.contains("flow_shift") && sample_json["flow_shift"].is_number()) {
16221627
target_params.flow_shift = sample_json["flow_shift"];
16231628
}
1629+
if (sample_json.contains("extra_sample_args") && sample_json["extra_sample_args"].is_string()) {
1630+
target_extra_sample_args = sample_json["extra_sample_args"].get<std::string>();
1631+
}
16241632
if (target_custom_sigmas != nullptr &&
16251633
sample_json.contains("custom_sigmas") &&
16261634
sample_json["custom_sigmas"].is_array()) {
@@ -1668,11 +1676,12 @@ bool SDGenerationParams::from_json_str(
16681676
};
16691677

16701678
if (j.contains("sample_params") && j["sample_params"].is_object()) {
1671-
parse_sample_params_json(j["sample_params"], sample_params, skip_layers, &custom_sigmas);
1679+
parse_sample_params_json(j["sample_params"], sample_params, extra_sample_args, skip_layers, &custom_sigmas);
16721680
}
16731681
if (j.contains("high_noise_sample_params") && j["high_noise_sample_params"].is_object()) {
16741682
parse_sample_params_json(j["high_noise_sample_params"],
16751683
high_noise_sample_params,
1684+
high_noise_extra_sample_args,
16761685
high_noise_skip_layers,
16771686
nullptr);
16781687
}
@@ -2099,6 +2108,8 @@ sd_img_gen_params_t SDGenerationParams::to_sd_img_gen_params_t() {
20992108
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
21002109
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
21012110
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
2111+
sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
2112+
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
21022113
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
21032114

21042115
sd_pm_params_t pm_params = {
@@ -2168,6 +2179,8 @@ sd_vid_gen_params_t SDGenerationParams::to_sd_vid_gen_params_t() {
21682179
high_noise_sample_params.guidance.slg.layer_count = high_noise_skip_layers.size();
21692180
sample_params.custom_sigmas = custom_sigmas.empty() ? nullptr : custom_sigmas.data();
21702181
sample_params.custom_sigmas_count = static_cast<int>(custom_sigmas.size());
2182+
sample_params.extra_sample_args = extra_sample_args.empty() ? nullptr : extra_sample_args.c_str();
2183+
high_noise_sample_params.extra_sample_args = high_noise_extra_sample_args.empty() ? nullptr : high_noise_extra_sample_args.c_str();
21712184
cache_params.scm_mask = scm_mask.empty() ? nullptr : scm_mask.c_str();
21722185

21732186
params.loras = lora_vec.empty() ? nullptr : lora_vec.data();
@@ -2306,6 +2319,7 @@ static json build_sampling_metadata_json(const sd_sample_params_t& sample_params
23062319
{"eta", sample_params.eta},
23072320
{"shifted_timestep", sample_params.shifted_timestep},
23082321
{"flow_shift", sample_params.flow_shift},
2322+
{"extra_sample_args", safe_json_string(sample_params.extra_sample_args)},
23092323
{"guidance",
23102324
{
23112325
{"txt_cfg", sample_params.guidance.txt_cfg},
@@ -2497,6 +2511,9 @@ std::string get_image_params(const SDContextParams& ctx_params,
24972511
}
24982512
parameter_string += "Guidance: " + std::to_string(gen_params.sample_params.guidance.distilled_guidance) + ", ";
24992513
parameter_string += "Eta: " + std::to_string(gen_params.sample_params.eta) + ", ";
2514+
if (!gen_params.extra_sample_args.empty()) {
2515+
parameter_string += "Extra sample args: " + gen_params.extra_sample_args + ", ";
2516+
}
25002517
parameter_string += "Seed: " + std::to_string(seed) + ", ";
25012518
parameter_string += "Size: " + std::to_string(gen_params.get_resolved_width()) + "x" + std::to_string(gen_params.get_resolved_height()) + ", ";
25022519
parameter_string += "Model: " + sd_basename(ctx_params.model_path) + ", ";

examples/common/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ struct SDGenerationParams {
168168

169169
sd_sample_params_t sample_params;
170170
sd_sample_params_t high_noise_sample_params;
171+
std::string extra_sample_args;
172+
std::string high_noise_extra_sample_args;
171173
std::vector<int> skip_layers = {7, 8, 9};
172174
std::vector<int> high_noise_skip_layers = {7, 8, 9};
173175

examples/server/README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,8 @@ Default Generation Options:
205205
--hires-upscaler <string> highres fix upscaler, Lanczos, Nearest, Latent, Latent (nearest), Latent
206206
(nearest-exact), Latent (antialiased), Latent (bicubic), Latent (bicubic
207207
antialiased), or a model name under --hires-upscalers-dir (default: Latent)
208+
--extra-sample-args <string> extra sampler args, key=value list. Currently lcm supports noise_clip_std,
209+
noise_scale_start, noise_scale_end
208210
-H, --height <int> image height, in pixel space (default: 512)
209211
-W, --width <int> image width, in pixel space (default: 512)
210212
--steps <int> number of sample steps (default: 20)

include/stable-diffusion.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,7 @@ typedef struct {
240240
float* custom_sigmas;
241241
int custom_sigmas_count;
242242
float flow_shift;
243+
const char* extra_sample_args;
243244
} sd_sample_params_t;
244245

245246
typedef struct {

src/conditioner.hpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@ struct SDCondition {
1414
sd::Tensor<float> c_concat;
1515
sd::Tensor<int32_t> c_t5_ids;
1616
sd::Tensor<float> c_t5_weights;
17+
sd::Tensor<int32_t> c_input_ids;
18+
sd::Tensor<int32_t> c_position_ids;
19+
sd::Tensor<int32_t> c_token_types;
20+
sd::Tensor<int32_t> c_vinput_mask;
21+
std::vector<std::pair<int, sd::Tensor<float>>> c_image_embeds;
22+
std::vector<sd::Tensor<float>> c_ref_images;
1723

1824
std::vector<sd::Tensor<float>> extra_c_crossattns;
1925

@@ -26,10 +32,24 @@ struct SDCondition {
2632

2733
bool empty() const {
2834
if (!c_crossattn.empty() || !c_vector.empty() || !c_concat.empty() ||
29-
!c_t5_ids.empty() || !c_t5_weights.empty()) {
35+
!c_t5_ids.empty() || !c_t5_weights.empty() ||
36+
!c_input_ids.empty() || !c_position_ids.empty() ||
37+
!c_token_types.empty() || !c_vinput_mask.empty()) {
3038
return false;
3139
}
3240

41+
for (const auto& image_embed : c_image_embeds) {
42+
if (!image_embed.second.empty()) {
43+
return false;
44+
}
45+
}
46+
47+
for (const auto& tensor : c_ref_images) {
48+
if (!tensor.empty()) {
49+
return false;
50+
}
51+
}
52+
3353
for (const auto& tensor : extra_c_crossattns) {
3454
if (!tensor.empty()) {
3555
return false;

src/denoiser.hpp

Lines changed: 109 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define __DENOISER_HPP__
33

44
#include <cmath>
5+
#include <string>
56
#include <utility>
67

78
#include "ggml_extend.hpp"
@@ -1148,7 +1149,80 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
11481149
sd::Tensor<float> x,
11491150
const std::vector<float>& sigmas,
11501151
std::shared_ptr<RNG> rng,
1151-
bool is_flow_denoiser) {
1152+
bool is_flow_denoiser,
1153+
const char* extra_sample_args = nullptr) {
1154+
struct LCMSampleArgs {
1155+
float noise_clip_std = 0.0f;
1156+
float noise_scale_start = 1.0f;
1157+
float noise_scale_end = 1.0f;
1158+
};
1159+
1160+
auto trim = [](std::string value) -> std::string {
1161+
const char* whitespace = " \t\r\n";
1162+
size_t begin = value.find_first_not_of(whitespace);
1163+
if (begin == std::string::npos) {
1164+
return "";
1165+
}
1166+
size_t end = value.find_last_not_of(whitespace);
1167+
return value.substr(begin, end - begin + 1);
1168+
};
1169+
1170+
LCMSampleArgs args;
1171+
if (extra_sample_args != nullptr && extra_sample_args[0] != '\0') {
1172+
std::string raw(extra_sample_args);
1173+
size_t start = 0;
1174+
bool noise_scale_end_was_set = false;
1175+
bool noise_scale_start_was_set = false;
1176+
auto parse_arg = [&](const std::string& item) {
1177+
std::string token = trim(item);
1178+
if (token.empty()) {
1179+
return;
1180+
}
1181+
size_t eq = token.find('=');
1182+
if (eq == std::string::npos) {
1183+
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
1184+
return;
1185+
}
1186+
1187+
std::string key = trim(token.substr(0, eq));
1188+
std::string value = trim(token.substr(eq + 1));
1189+
float parsed = 0.0f;
1190+
try {
1191+
size_t consumed = 0;
1192+
parsed = std::stof(value, &consumed);
1193+
if (trim(value.substr(consumed)).size() != 0) {
1194+
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
1195+
return;
1196+
}
1197+
} catch (const std::exception&) {
1198+
LOG_WARN("ignoring invalid lcm extra sample arg '%s'", token.c_str());
1199+
return;
1200+
}
1201+
1202+
if (key == "noise_clip_std") {
1203+
args.noise_clip_std = parsed;
1204+
} else if (key == "noise_scale_start") {
1205+
args.noise_scale_start = parsed;
1206+
noise_scale_start_was_set = true;
1207+
} else if (key == "noise_scale_end") {
1208+
args.noise_scale_end = parsed;
1209+
noise_scale_end_was_set = true;
1210+
} else {
1211+
LOG_WARN("ignoring unknown lcm extra sample arg '%s'", key.c_str());
1212+
}
1213+
};
1214+
1215+
for (size_t pos = 0; pos <= raw.size(); ++pos) {
1216+
if (pos == raw.size() || raw[pos] == ',' || raw[pos] == ';') {
1217+
parse_arg(raw.substr(start, pos - start));
1218+
start = pos + 1;
1219+
}
1220+
}
1221+
if (noise_scale_start_was_set && !noise_scale_end_was_set) {
1222+
args.noise_scale_end = args.noise_scale_start;
1223+
}
1224+
}
1225+
11521226
int steps = static_cast<int>(sigmas.size()) - 1;
11531227
for (int i = 0; i < steps; i++) {
11541228
auto denoised_opt = model(x, sigmas[i], i + 1, nullptr);
@@ -1160,7 +1234,27 @@ static sd::Tensor<float> sample_lcm(denoise_cb_t model,
11601234
if (is_flow_denoiser) {
11611235
x *= (1 - sigmas[i + 1]);
11621236
}
1163-
x += sd::Tensor<float>::randn_like(x, rng) * sigmas[i + 1];
1237+
auto noise = sd::Tensor<float>::randn_like(x, rng);
1238+
if (args.noise_clip_std > 0.0f && noise.numel() > 0) {
1239+
double mean = 0.0;
1240+
for (int64_t j = 0; j < noise.numel(); ++j) {
1241+
mean += static_cast<double>(noise[j]);
1242+
}
1243+
mean /= static_cast<double>(noise.numel());
1244+
1245+
double variance = 0.0;
1246+
for (int64_t j = 0; j < noise.numel(); ++j) {
1247+
double centered = static_cast<double>(noise[j]) - mean;
1248+
variance += centered * centered;
1249+
}
1250+
variance /= static_cast<double>(noise.numel());
1251+
1252+
float clip_val = args.noise_clip_std * static_cast<float>(std::sqrt(variance));
1253+
noise = sd::ops::clamp(noise, -clip_val, clip_val);
1254+
}
1255+
float t = steps > 1 ? static_cast<float>(i) / static_cast<float>(steps - 1) : 0.0f;
1256+
float noise_scale = args.noise_scale_start + (args.noise_scale_end - args.noise_scale_start) * t;
1257+
x += noise * (sigmas[i + 1] * noise_scale);
11641258
}
11651259
}
11661260
return x;
@@ -1656,15 +1750,15 @@ static sd::Tensor<float> sample_euler_cfg_pp(denoise_cb_t model,
16561750
for (int i = 0; i < steps; i++) {
16571751
float sigma = sigmas[i];
16581752
sd::Tensor<float> uncond_denoised;
1659-
1753+
16601754
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
16611755
if (denoised_opt.empty() || uncond_denoised.empty()) {
16621756
return {};
16631757
}
1664-
1758+
16651759
sd::Tensor<float> denoised = std::move(denoised_opt);
1666-
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
1667-
1760+
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
1761+
16681762
x = denoised + d * sigmas[i + 1];
16691763
}
16701764
return x;
@@ -1679,19 +1773,19 @@ static sd::Tensor<float> sample_euler_ancestral_cfg_pp(denoise_cb_t model,
16791773
for (int i = 0; i < steps; i++) {
16801774
float sigma = sigmas[i];
16811775
sd::Tensor<float> uncond_denoised;
1682-
1776+
16831777
auto denoised_opt = model(x, sigma, i + 1, &uncond_denoised);
16841778
if (denoised_opt.empty() || uncond_denoised.empty()) {
16851779
return {};
16861780
}
1687-
1781+
16881782
sd::Tensor<float> denoised = std::move(denoised_opt);
1689-
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
1690-
1783+
sd::Tensor<float> d = (x - uncond_denoised) / sigma;
1784+
16911785
auto [sigma_down, sigma_up] = get_ancestral_step(sigmas[i], sigmas[i + 1], eta);
1692-
1786+
16931787
x = denoised + d * sigma_down;
1694-
1788+
16951789
if (sigmas[i + 1] > 0) {
16961790
x += sd::Tensor<float>::randn_like(x, rng) * sigma_up;
16971791
}
@@ -1706,7 +1800,8 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
17061800
std::vector<float> sigmas,
17071801
std::shared_ptr<RNG> rng,
17081802
float eta,
1709-
bool is_flow_denoiser) {
1803+
bool is_flow_denoiser,
1804+
const char* extra_sample_args) {
17101805
switch (method) {
17111806
case EULER_A_SAMPLE_METHOD:
17121807
if (is_flow_denoiser)
@@ -1729,7 +1824,7 @@ static sd::Tensor<float> sample_k_diffusion(sample_method_t method,
17291824
case DPMPP2Mv2_SAMPLE_METHOD:
17301825
return sample_dpmpp_2m_v2(model, std::move(x), sigmas);
17311826
case LCM_SAMPLE_METHOD:
1732-
return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser);
1827+
return sample_lcm(model, std::move(x), sigmas, rng, is_flow_denoiser, extra_sample_args);
17331828
case IPNDM_SAMPLE_METHOD:
17341829
return sample_ipndm(model, std::move(x), sigmas);
17351830
case IPNDM_V_SAMPLE_METHOD:

0 commit comments

Comments
 (0)