From d629fb37a09b04b37ac80eac9411f049f1156d2d Mon Sep 17 00:00:00 2001 From: Orchids Agent Date: Sun, 12 Apr 2026 21:41:56 +0000 Subject: [PATCH] feat: add CPU offload toggle to performance settings Adds a 'CPU Model Loading' checkbox in Performance settings that sends cpu_offload in the WebSocket init message. When enabled, the world_engine server builds the model on CPU before moving to GPU, reducing peak VRAM during initialization. Essential for systems with limited GPU memory. Changes: - Top-level cpu_offload setting (default: false) - Checkbox in Performance section with i18n (en/ja/zh/goose) - WebSocket init message includes cpu_offload flag - Lifecycle model key encodes cpu_offload so toggling triggers reconnect - Mode-switch modal shown when toggling during active streaming - Server passes cpu_offload through to WorldEngine constructor Companion PR: Overworldai/world_engine#40 --- package-lock.json | 4 ++-- server-components/engine_manager.py | 3 ++- server-components/server.py | 5 +++-- src/components/MenuSettingsView.tsx | 15 ++++++++++++++- src/context/StreamingContext.tsx | 11 ++++++++--- src/context/streamingLifecyclePayload.ts | 8 +++++--- src/i18n/en.ts | 3 +++ src/i18n/goose.ts | 3 +++ src/i18n/ja.ts | 2 ++ src/i18n/zh.ts | 2 ++ src/types/settings.ts | 1 + src/types/ws.ts | 1 + 12 files changed, 46 insertions(+), 12 deletions(-) diff --git a/package-lock.json b/package-lock.json index f351ecff..b326a1a6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "biome", - "version": "1.0.0-rc3", + "version": "1.0.1", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "biome", - "version": "1.0.0-rc3", + "version": "1.0.1", "dependencies": { "@tailwindcss/vite": "^4.2.1", "framer-motion": "^12.35.0", diff --git a/server-components/engine_manager.py b/server-components/engine_manager.py index fe732185..24b5e08e 100755 --- a/server-components/engine_manager.py +++ b/server-components/engine_manager.py @@ -324,7 +324,7 @@ async def load_seed_from_base64(self, base64_data: str) -> torch.Tensor: lambda: self._load_seed_from_base64_sync(base64_data) ) - async def load_engine(self, model_uri: str, quant: str | None = None): + async def load_engine(self, model_uri: str, quant: str | None = None, cpu_offload: bool = False): """Initialize or switch the WorldEngine model. model_uri is required — the server does not have a default model. @@ -400,6 +400,7 @@ def _create_engine(): device=DEVICE, quant=requested_quant, dtype=dtype, + cpu_offload=cpu_offload, ) new_engine = await self._run_on_cuda_thread(_create_engine) diff --git a/server-components/server.py b/server-components/server.py index 0144b5a7..d534cdaf 100755 --- a/server-components/server.py +++ b/server-components/server.py @@ -591,7 +591,7 @@ async def websocket_endpoint(websocket: WebSocket): Client -> Server: {"type": "control", "buttons": [str], "mouse_dx": float, "mouse_dy": float, "ts": float} - {"type": "init", "req_id": "...", "model": str, "seed_image_data": str, "seed_filename": str, "scene_edit": bool, "action_logging": bool, "quant": str|null} + {"type": "init", "req_id": "...", "model": str, "seed_image_data": str, "seed_filename": str, "scene_edit": bool, "action_logging": bool, "quant": str|null, "cpu_offload": bool} {"type": "reset"} {"type": "pause"} {"type": "resume"} @@ -798,6 +798,7 @@ async def handle_init(msg: dict, is_game_loop: bool = False) -> tuple[bool, bool seed_data = msg.get("seed_image_data") seed_filename = msg.get("seed_filename") quant = msg.get("quant") + cpu_offload = msg.get("cpu_offload", False) # Update flags if "scene_edit" in msg: @@ -832,7 +833,7 @@ async def handle_init(msg: dict, is_game_loop: bool = False) -> tuple[bool, bool if model_uri and (model_uri != getattr(world_engine, "model_uri", None) or quant_changed): logger.info(f"[{client_host}] {'Live model switch' if is_game_loop else 'Requested model'}: {model_uri} (quant={quant})") world_engine.set_progress_callback(progress_callback, asyncio.get_running_loop()) - await world_engine.load_engine(model_uri, quant=quant) + await world_engine.load_engine(model_uri, quant=quant, cpu_offload=cpu_offload) world_engine.set_progress_callback(None) world_engine.seed_frame = None session.perceptual_frame_count = 0 diff --git a/src/components/MenuSettingsView.tsx b/src/components/MenuSettingsView.tsx index ea4ce72a..5752a71c 100644 --- a/src/components/MenuSettingsView.tsx +++ b/src/components/MenuSettingsView.tsx @@ -100,6 +100,7 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { const [showCredits, setShowCredits] = useState(false) const [menuQuant, setMenuQuant] = useState(settings.engine_quant ?? 'none') + const [menuCpuOffload, setMenuCpuOffload] = useState(() => settings.cpu_offload ?? false) const [menuCapInferenceFps, setMenuCapInferenceFps] = useState(() => settings.cap_inference_fps ?? true) const [menuKeybindings, setMenuKeybindings] = useState(() => ({ ...settings.keybindings })) const [menuSceneEditEnabled, setMenuSceneEditEnabled] = useState( @@ -229,6 +230,7 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { setMenuMouseSensitivity(streamingToMenu(settings.mouse_sensitivity ?? mouseSensitivity)) setMenuServerUrl(configServerUrl) setMenuQuant(settings.engine_quant ?? 'none') + setMenuCpuOffload(settings.cpu_offload ?? false) setMenuKeybindings({ ...settings.keybindings }) setMenuSceneEditEnabled(settings.experimental?.scene_edit_enabled ?? false) setMenuPerformanceStats(settings.debug_overlays.performance_stats) @@ -358,6 +360,7 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { engine_mode: engineModeValue, engine_model: menuWorldModel, engine_quant: menuQuant, + cpu_offload: menuCpuOffload, cap_inference_fps: menuCapInferenceFps, mouse_sensitivity: streamingValue, keybindings: menuKeybindings, @@ -382,6 +385,7 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { menuServerUrl, menuWorldModel, menuQuant, + menuCpuOffload, menuCapInferenceFps, menuKeybindings, menuSceneEditEnabled, @@ -397,13 +401,14 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { const hasEngineModeChanged = menuEngineMode !== (configEngineMode === ENGINE_MODES.SERVER ? 'server' : 'standalone') const hasWorldModelChanged = menuWorldModel !== configWorldModel const hasQuantChanged = menuQuant !== (settings.engine_quant ?? 'none') + const hasCpuOffloadChanged = menuCpuOffload !== (settings.cpu_offload ?? false) const handleBackClick = useCallback(async () => { if (menuEngineMode === 'server' && (!menuServerUrl.trim() || serverUrlStatus !== 'valid')) { setShowServerErrorModal(true) return } - if (isStreaming && (hasEngineModeChanged || hasWorldModelChanged || hasQuantChanged)) { + if (isStreaming && (hasEngineModeChanged || hasWorldModelChanged || hasQuantChanged || hasCpuOffloadChanged)) { setShowModeSwitchModal(true) return } @@ -416,6 +421,8 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { isStreaming, hasEngineModeChanged, hasWorldModelChanged, + hasQuantChanged, + hasCpuOffloadChanged, applyDraftSettings, onBack ]) @@ -586,6 +593,12 @@ const MenuSettingsView = ({ onBack, wide }: MenuSettingsViewProps) => { onChange={(v) => setMenuQuant(v as QuantOption)} /> + { // Set lastAppliedModel before await to prevent the lifecycle machine from // seeing a model mismatch during the re-render triggered by setInitMetrics. const quant = settings.engine_quant ?? 'none' + const cpuOffload = settings.cpu_offload ?? false lastAppliedModelRef.current = settings.experimental?.scene_edit_enabled - ? `${selectedModel}+scene_edit+${quant}` - : `${selectedModel}+${quant}` + ? `${selectedModel}+scene_edit+${quant}+cpu${cpuOffload ? '1' : '0'}` + : `${selectedModel}+${quant}+cpu${cpuOffload ? '1' : '0'}` const metrics = await sendInit({ model: selectedModel, @@ -257,6 +258,7 @@ export const StreamingProvider = ({ children }: { children: ReactNode }) => { scene_edit: settings.experimental?.scene_edit_enabled ?? false, action_logging: settings.debug_overlays?.action_logging ?? false, quant: quant !== 'none' ? quant : null, + cpu_offload: cpuOffload, cap_inference_fps: settings.cap_inference_fps ?? true }) setInitMetrics(metrics) @@ -269,6 +271,7 @@ export const StreamingProvider = ({ children }: { children: ReactNode }) => { isConnected, settings?.engine_model, settings?.engine_quant, + settings?.cpu_offload, settings?.cap_inference_fps, settings.experimental?.scene_edit_enabled, settings.debug_overlays?.action_logging, @@ -360,7 +363,8 @@ export const StreamingProvider = ({ children }: { children: ReactNode }) => { isPaused, sceneEditActive: sceneEditGrace, sceneEditEnabled: settings.experimental?.scene_edit_enabled, - engineQuant: settings.engine_quant + engineQuant: settings.engine_quant, + cpuOffload: settings.cpu_offload }) }) }, [ @@ -369,6 +373,7 @@ export const StreamingProvider = ({ children }: { children: ReactNode }) => { error, settings?.engine_model, settings?.engine_quant, + settings?.cpu_offload, settings.experimental?.scene_edit_enabled, engineError, hasReceivedFrame, diff --git a/src/context/streamingLifecyclePayload.ts b/src/context/streamingLifecyclePayload.ts index 044468df..1f733ea0 100644 --- a/src/context/streamingLifecyclePayload.ts +++ b/src/context/streamingLifecyclePayload.ts @@ -18,17 +18,19 @@ type BuildStreamingLifecycleSyncPayloadArgs = { sceneEditActive: boolean sceneEditEnabled?: boolean engineQuant?: string + cpuOffload?: boolean } export const buildStreamingLifecycleSyncPayload = ( args: BuildStreamingLifecycleSyncPayloadArgs ): StreamingLifecycleSyncPayload => { - // Encode scene_edit_enabled and quant into the model key so toggling - // either triggers the same intentional-reconnect flow as switching models. + // Encode scene_edit_enabled, quant, and cpu_offload into the model key so + // toggling any of them triggers the same intentional-reconnect flow as + // switching models. const baseModel = args.engineModel || DEFAULT_WORLD_ENGINE_MODEL const quant = args.engineQuant ?? 'none' let selectedModel = args.sceneEditEnabled ? `${baseModel}+scene_edit` : baseModel - selectedModel = `${selectedModel}+${quant}` + selectedModel = `${selectedModel}+${quant}+cpu${args.cpuOffload ? '1' : '0'}` return { portalState: args.portalState, diff --git a/src/i18n/en.ts b/src/i18n/en.ts index 53c70b5c..85f7f502 100644 --- a/src/i18n/en.ts +++ b/src/i18n/en.ts @@ -160,6 +160,9 @@ const en = { quantization: 'Quantization', quantizationDescription: 'Reduces model precision for faster inference and lower memory usage, at the cost of some visual quality.\nFirst use of INT8 quantization can take 1-2 hours while inference kernels are optimized - this is a one-time cost.', + cpuOffload: 'CPU Model Loading', + cpuOffloadDescription: + 'Builds the model on CPU before moving it to GPU. Essential for systems with low VRAM.', capInferenceFps: 'Cap Inference FPS', capInferenceFpsDescription: "Limits the generation rate to the model's trained framerate. Without this, the game may run faster than intended." diff --git a/src/i18n/goose.ts b/src/i18n/goose.ts index 950af495..4202ca3f 100644 --- a/src/i18n/goose.ts +++ b/src/i18n/goose.ts @@ -161,6 +161,9 @@ const goose = { quantization: 'Feather compression', quantizationDescription: 'Plucks a few feathers for faster waddling with less nest space, at the cost of some plumage quality.\nFirst INT8 plucking takes 1-2 hours while the goose optimizes its molt - this is a one-time cost.', + cpuOffload: 'Nest on land first', + cpuOffloadDescription: + 'Builds the goose on land before sending it to the pond. Essential for ponds with limited space.', capInferenceFps: 'Cap honk rate', capInferenceFpsDescription: "Limits the waddling rate to the flock's trained pace. Turning this off may result in the goose waddling faster than intended." diff --git a/src/i18n/ja.ts b/src/i18n/ja.ts index ea3dd2aa..a821bbeb 100644 --- a/src/i18n/ja.ts +++ b/src/i18n/ja.ts @@ -159,6 +159,8 @@ const ja = { quantization: '量子化', quantizationDescription: 'モデルの精度を下げて推論速度を向上させ、メモリ使用量を削減します。画質がわずかに低下します。\nINT8量子化の初回使用時は、推論カーネルの最適化に1-2時間かかる場合がありますが、これは一度だけのコストです。', + cpuOffload: 'CPUモデル読み込み', + cpuOffloadDescription: 'モデルをCPU上で構築してからGPUに転送します。VRAM不足のシステムでは必須です。', capInferenceFps: '推論FPSを制限', capInferenceFpsDescription: 'モデルの学習フレームレートに合わせて生成速度を制限します。オフにすると、ゲーム速度が意図より速くなる場合があります。' diff --git a/src/i18n/zh.ts b/src/i18n/zh.ts index ebf089e1..8ced0b9c 100644 --- a/src/i18n/zh.ts +++ b/src/i18n/zh.ts @@ -154,6 +154,8 @@ const zh = { quantization: '量化', quantizationDescription: '降低模型精度以加快推理速度并减少显存占用,但会略微降低画质。\n首次使用INT8量化时,推理内核优化可能需要1-2小时,但这是一次性的。', + cpuOffload: 'CPU模型加载', + cpuOffloadDescription: '在CPU上构建模型后再移至GPU。对于显存不足的系统必不可少。', capInferenceFps: '限制推理帧率', capInferenceFpsDescription: '将生成速率限制为模型的训练帧率。关闭此选项可能导致游戏速度快于预期。' }, diff --git a/src/types/settings.ts b/src/types/settings.ts index da756d90..84ebde1f 100644 --- a/src/types/settings.ts +++ b/src/types/settings.ts @@ -43,6 +43,7 @@ export const settingsSchema = z.object({ engine_mode: z.enum(['standalone', 'server']).default('standalone'), engine_model: z.string().default(DEFAULT_WORLD_ENGINE_MODEL), engine_quant: z.enum(QUANT_OPTIONS).default('none'), + cpu_offload: z.boolean().default(false), cap_inference_fps: z.boolean().default(true), custom_models: z.array(z.string()).default([]), mouse_sensitivity: z.number().min(0.1).max(3.0).default(1.8), diff --git a/src/types/ws.ts b/src/types/ws.ts index 87b1b99b..274f61d4 100644 --- a/src/types/ws.ts +++ b/src/types/ws.ts @@ -21,6 +21,7 @@ export type InitMessage = { scene_edit?: boolean action_logging?: boolean quant?: string | null + cpu_offload?: boolean cap_inference_fps?: boolean } export type InitResponse = {