# NOTE(review): the lines below are residue from a repository file listing
# (filename, timestamp, size) that was captured together with the source;
# kept as a comment so the module remains importable.
# blender-portable-repo/scripts/addons/HoloMARI_MultiRender.py
# 2026-04-20 11:57:06 -05:00 — 5675 lines, 220 KiB, Python
# Add-on registration metadata read by Blender's add-on system.
bl_info = {
    "name": "Multi-Headless Instance Renderer | HoloMARI Platform",
    "author": "HP Park",
    "version": (1, 6, 5),
    "blender": (2, 93, 0),  # minimum supported Blender version
    # Fix: "â–¸" was mojibake (UTF-8 "▸" decoded as CP-1252); restored the arrow.
    "location": "Render Properties ▸ Multi-Instance Frames",
    "description": "Spawns one or more headless Blender workers per GPU to maximize available compute resources.",
    "category": "Render",
}
import bpy
import os
import sys
import re
import shutil
import tempfile
import time
import threading
import queue
import subprocess
import signal
import socket
import json
import random
import addon_utils
import string
import shlex
import hashlib
from pathlib import Path
from collections import Counter, deque
# Identifier for this add-on's stored settings/state.
ADDON_KEY = "multi_instance_render"
# Registered keymap items, presumably kept for cleanup on unregister — TODO confirm.
_KM_ITEMS = []
# Module-level render-manager singleton (None while no session is active).
_MANAGER = None
# Platform flags used to pick OS-specific code paths.
IS_WIN = (os.name == "nt")
IS_MAC = (sys.platform == "darwin")
# Output formats that produce a single video stream rather than frame files.
VIDEO_FORMATS = {"FFMPEG", "AVI_JPEG", "AVI_RAW", "FRAME_SERVER"}
# ----------------------- helpers -----------------------
def _log(msg):
print(f"[MGPU] {msg}")
def _manager_has_active_workers(manager):
if not manager:
return False
try:
for w in getattr(manager, "workers", []):
try:
if w.running:
return True
except Exception:
continue
except Exception:
pass
return False
def _cleanup_stale_manager():
    """Drop the module-level _MANAGER singleton when none of its workers
    are still running, stopping it best-effort first."""
    global _MANAGER
    if _MANAGER and not _manager_has_active_workers(_MANAGER):
        try:
            _MANAGER.stop()
        except Exception:
            # Best-effort: a failing stop() must not block the cleanup.
            pass
        _MANAGER = None
def _cycles_prefs():
    """Return the Cycles add-on preferences object, or None when it is
    unavailable (Cycles disabled, restricted context, etc.)."""
    try:
        addons = bpy.context.preferences.addons
        return addons["cycles"].preferences
    except Exception:
        return None
def _current_compute_type():
    """Return the active Cycles compute backend name, defaulting to "CUDA"
    when preferences are unavailable or the attribute is empty."""
    prefs = _cycles_prefs()
    if not prefs:
        return "CUDA"
    backend = getattr(prefs, "compute_device_type", "CUDA")
    return backend or "CUDA"
def _cycles_cpu_device_selected():
    """Return True when any CPU device is ticked ("use") in the Cycles
    preferences; False when preferences are missing or no CPU is ticked."""
    prefs = _cycles_prefs()
    if not prefs:
        return False
    try:
        prefs.refresh_devices()
    except Exception:
        pass
    for dev in getattr(prefs, "devices", []):
        try:
            dev_type = str(getattr(dev, "type", "") or "").upper()
            if dev_type == "CPU" and bool(getattr(dev, "use", False)):
                return True
        except Exception:
            pass
    return False
def _fmt_bytes(n):
try:
for unit in ["B","KiB","MiB","GiB","TiB"]:
if n < 1024: return f"{n:.1f}{unit}"
n /= 1024.0
except Exception:
pass
return "?"
def _median(values):
vals = []
for v in (values or []):
try:
fv = float(v)
if fv > 0:
vals.append(fv)
except Exception:
pass
if not vals:
return None
vals.sort()
n = len(vals)
m = n // 2
if n % 2 == 1:
return vals[m]
return (vals[m - 1] + vals[m]) * 0.5
# Render-time watchdog ("guard") tuning profiles, keyed by tier name.
# Naming conventions: *_s values are seconds, *_mult values are multiples
# of the measured baseline render time, warmup_* count jobs that must
# complete before the guard arms. Consumed via _rendertime_guard_profile().
_RENDERTIME_GUARD_PROFILES = {
    # Guard disabled entirely; only always-read keys are kept.
    "OFF": {
        "enabled": False,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [],
    },
    # Slow to intervene: long timeouts, smallest restart budgets.
    "CONSERVATIVE": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 6,
        "soft_mult": 3.8,
        "soft_min_s": 150.0,
        "hard_mult": 7.0,
        "hard_min_s": 420.0,
        "progress_stall_s": 120.0,
        "hedge_grace_s": 90.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 300.0,
        "worker_restart_budget": 1,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 2,
        "global_restart_window_s": 180.0,
        "single_worker_min_stall_s": 240.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
    # Middle ground between CONSERVATIVE and AGGRESSIVE.
    "BALANCED": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 4,
        "soft_mult": 3.0,
        "soft_min_s": 90.0,
        "hard_mult": 5.5,
        "hard_min_s": 300.0,
        "progress_stall_s": 90.0,
        "hedge_grace_s": 60.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 180.0,
        "worker_restart_budget": 2,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 3,
        "global_restart_window_s": 150.0,
        "single_worker_min_stall_s": 180.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
    # Quick to hedge/restart; the only tier with periodic recycling on.
    "AGGRESSIVE": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 2,
        "soft_mult": 1.6,
        "soft_min_s": 35.0,
        "hard_mult": 2.1,
        "hard_min_s": 95.0,
        "progress_stall_s": 35.0,
        "hedge_grace_s": 20.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 90.0,
        "worker_restart_budget": 3,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 8,
        "global_restart_window_s": 180.0,
        "single_worker_min_stall_s": 100.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": True,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
}
def _rendertime_guard_profile(tier: str):
    """Return a copy of the guard profile for *tier* (case-insensitive),
    falling back to the AGGRESSIVE profile, with a "tier" key added."""
    key = str(tier or "AGGRESSIVE").upper()
    profile = _RENDERTIME_GUARD_PROFILES.get(key)
    if not profile:
        profile = _RENDERTIME_GUARD_PROFILES["AGGRESSIVE"]
    result = dict(profile)
    result["tier"] = key
    return result
def _classify_launch_exception(exc: Exception):
s = str(exc or "")
low = s.lower()
if ("1455" in low) or ("paging file" in low) or ("not enough memory" in low):
return "SYSTEM_RAM_OR_COMMIT_EXHAUSTED"
if ("access is denied" in low) or ("permission denied" in low):
return "ACCESS_DENIED"
if ("file not found" in low) or ("no such file" in low):
return "BINARY_OR_PATH_NOT_FOUND"
if ("is not recognized" in low):
return "BINARY_NOT_EXECUTABLE"
return "PROCESS_START_EXCEPTION"
def _classify_runtime_exit_reason(last_line: str, returncode):
rc = "" if returncode is None else str(returncode)
ll = (last_line or "").lower()
if ("out of memory" in ll) or ("not enough memory" in ll):
return "EXIT_OUT_OF_MEMORY"
if ("cuda error out of memory" in ll) or ("optix error out of memory" in ll):
return "EXIT_GPU_VRAM_OOM"
if ("failed to create" in ll and "context" in ll):
return "EXIT_GPU_CONTEXT_INIT_FAILED"
if rc == "-1073741819":
return "EXIT_ACCESS_VIOLATION"
if rc == "-1073740791":
return "EXIT_STACK_BUFFER_OVERRUN"
return "EXIT_BEFORE_HANDSHAKE"
def _mgpu_is_video(scene):
    """Return True when *scene* renders to a video container, i.e. its
    image-settings file_format is in VIDEO_FORMATS or its media_type is
    "VIDEO". Any lookup failure yields False."""
    try:
        settings = scene.render.image_settings
        file_format = str(getattr(settings, "file_format", "") or "").upper()
        media_type = str(getattr(settings, "media_type", "") or "").upper()
        return file_format in VIDEO_FORMATS or media_type == "VIDEO"
    except Exception:
        return False
def _mgpu_video_output_path(scene):
    """Return the absolute render output path of *scene*, or "" on failure."""
    try:
        return bpy.path.abspath(scene.render.filepath)
    except Exception:
        return ""
def _mgpu_first_frame_path(scene):
    """Return the absolute path of the scene's first output frame.

    Falls back to the raw render filepath, and finally "" when even
    that lookup fails.
    """
    try:
        return bpy.path.abspath(scene.render.frame_path(frame=scene.frame_start))
    except Exception:
        try:
            return bpy.path.abspath(scene.render.filepath)
        except Exception:
            return ""
def _mgpu_dir_has_entries(path):
try:
if not os.path.isdir(path):
return False
with os.scandir(path) as it:
for _entry in it:
return True
except Exception:
return False
return False
def _mgpu_sequence_exists(first_frame_path):
try:
dir_path = os.path.dirname(first_frame_path)
if not os.path.isdir(dir_path):
return False
base = os.path.basename(first_frame_path)
m = re.search(r"(\\d+)(\\.[^.]+)?$", base)
if not m:
return False
prefix = base[:m.start(1)]
suffix = m.group(2) or ""
for name in os.listdir(dir_path):
if not name.startswith(prefix):
continue
if suffix and not name.endswith(suffix):
continue
return True
except Exception:
return False
return False
def _mgpu_video_temp_dir_for(scene, use_target_dir=True):
    """Pick a directory for a video render's intermediate frames.

    Prefers a "<basename>_TEMP" folder beside the output file when
    *use_target_dir* is True and the output has a directory; otherwise
    falls back to a hashed folder under the system temp directory.
    """
    out_path = _mgpu_video_output_path(scene) or "mgpu_video"
    stem = os.path.splitext(os.path.basename(out_path))[0] or "render"
    safe_stem = re.sub(r"[^A-Za-z0-9._-]+", "_", stem)
    digest = hashlib.md5(out_path.encode("utf-8", "ignore")).hexdigest()[:8]
    if use_target_dir:
        target_dir = os.path.dirname(out_path)
        if target_dir:
            return os.path.join(target_dir, f"{safe_stem}_TEMP")
    return os.path.join(tempfile.gettempdir(), f"mgpu_frames_{safe_stem}_{digest}")
def _mgpu_overwrite_warnings(scene, is_video, temp_dir=None):
    """Collect human-readable warnings about output that this render
    would overwrite: the video file, the first frame, sibling sequence
    files, or a non-empty temp frame folder."""
    warnings = []
    if is_video:
        out_path = _mgpu_video_output_path(scene)
        if out_path and os.path.exists(out_path):
            warnings.append(f"Output file exists: {out_path}")
    else:
        first_frame = _mgpu_first_frame_path(scene)
        if first_frame and os.path.exists(first_frame):
            warnings.append(f"Output frame exists: {first_frame}")
        if first_frame and _mgpu_sequence_exists(first_frame):
            warnings.append(f"Existing frame files detected in: {os.path.dirname(first_frame)}")
    if temp_dir and _mgpu_dir_has_entries(temp_dir):
        warnings.append(f"Temp frame folder has files: {temp_dir}")
    return warnings
def _mgpu_has_mari_addon():
    """Return True when the MARI add-on is installed *and* enabled.

    A module counts as the MARI add-on when it exposes ``addon_prefix``
    == "mari" or its bl_info name is "mari advanced" (case-insensitive).
    """
    mari_mod = None
    try:
        for mod in addon_utils.modules():
            if getattr(mod, "addon_prefix", None) == "mari":
                mari_mod = mod
                break
            bi = getattr(mod, "bl_info", {}) or {}
            if (bi.get("name") or "").strip().lower() == "mari advanced":
                mari_mod = mod
                break
    except Exception:
        pass
    if not mari_mod:
        return False
    try:
        name = getattr(mari_mod, "__name__", None)
        if name:
            # addon_utils.check(name) -> (default_loaded, loaded_state)
            _loaded, enabled = addon_utils.check(name)
            return bool(enabled)
    except Exception:
        pass
    return False
def _mgpu_enabled_addons_snapshot():
    """Capture add-ons currently enabled in this Blender session.

    Returns a list of {"module": name[, "file": abs_path], "is_package":
    bool} records, deduplicated while keeping order: names registered in
    preferences first, then anything addon_utils reports as enabled.
    """
    records = []
    names = []
    try:
        prefs_addons = getattr(bpy.context.preferences, "addons", None)
        if prefs_addons is not None:
            names.extend(list(prefs_addons.keys()))
    except Exception:
        pass
    try:
        for meta in addon_utils.modules():
            mod_name = getattr(meta, "__name__", None)
            if not mod_name:
                continue
            enabled = False
            try:
                # check() normally returns a tuple of load flags; accept a
                # bare truthy value too, defensively.
                state = addon_utils.check(mod_name)
                if isinstance(state, tuple):
                    enabled = any(bool(v) for v in state)
                else:
                    enabled = bool(state)
            except Exception:
                enabled = False
            if enabled:
                names.append(mod_name)
    except Exception:
        pass
    # NOTE(review): `dedup` is never used; `seen`/`records` below do the
    # actual deduplication.
    dedup = []
    seen = set()
    for n in names:
        if not n or n in seen:
            continue
        seen.add(n)
        src = ""
        is_pkg = False
        try:
            mod = sys.modules.get(n)
            if mod is None:
                # Not imported yet: look the module up in the add-on registry.
                for meta in addon_utils.modules():
                    if getattr(meta, "__name__", None) == n:
                        mod = meta
                        break
            src = str(getattr(mod, "__file__", "") or "")
            if src:
                src = os.path.abspath(src)
                is_pkg = os.path.basename(src).lower() == "__init__.py"
        except Exception:
            src = ""
            is_pkg = False
        rec = {"module": n}
        if src:
            rec["file"] = src
        rec["is_package"] = bool(is_pkg)
        records.append(rec)
    return records
def _mgpu_enabled_addon_module_names(records):
names = []
seen = set()
for entry in (records or []):
if isinstance(entry, str):
mod_name = str(entry or "")
elif isinstance(entry, dict):
mod_name = str(entry.get("module") or "")
else:
mod_name = ""
if not mod_name or mod_name in seen:
continue
seen.add(mod_name)
names.append(mod_name)
return names
def _proc_rss_bytes():
    """Best-effort resident memory of this process, in bytes.

    Windows: current working-set size via psapi. POSIX: ru_maxrss from
    getrusage — note this is the *peak* RSS, reported in bytes on macOS
    and in kilobytes on other platforms. Returns None when unknown.
    """
    if IS_WIN:
        try:
            import ctypes, ctypes.wintypes as wt
            class PROCESS_MEMORY_COUNTERS(ctypes.Structure):
                # Mirrors the Win32 PROCESS_MEMORY_COUNTERS layout.
                _fields_ = [
                    ("cb", wt.DWORD), ("PageFaultCount", wt.DWORD),
                    ("PeakWorkingSetSize", ctypes.c_size_t), ("WorkingSetSize", ctypes.c_size_t),
                    ("QuotaPeakPagedPoolUsage", ctypes.c_size_t), ("QuotaPagedPoolUsage", ctypes.c_size_t),
                    ("QuotaPeakNonPagedPoolUsage", ctypes.c_size_t), ("QuotaNonPagedPoolUsage", ctypes.c_size_t),
                    ("PagefileUsage", ctypes.c_size_t), ("PeakPagefileUsage", ctypes.c_size_t),
                ]
            GetProcessMemoryInfo = ctypes.windll.psapi.GetProcessMemoryInfo
            GetCurrentProcess = ctypes.windll.kernel32.GetCurrentProcess
            h = GetCurrentProcess()
            counters = PROCESS_MEMORY_COUNTERS()
            counters.cb = ctypes.sizeof(PROCESS_MEMORY_COUNTERS)
            if GetProcessMemoryInfo(h, ctypes.byref(counters), counters.cb):
                return int(counters.WorkingSetSize)
            # On API failure this falls through and returns an implicit None.
        except Exception:
            return None
    else:
        try:
            import resource
            r = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            if IS_MAC:
                return int(r)  # macOS reports ru_maxrss in bytes
            return int(r * 1024)  # elsewhere ru_maxrss is in kilobytes
        except Exception:
            return None
def _sys_mem_available_bytes():
    """Available physical system memory in bytes, or None when unknown.

    Windows: GlobalMemoryStatusEx. Elsewhere: psutil (an optional
    third-party dependency) when importable.
    """
    if IS_WIN:
        try:
            import ctypes, ctypes.wintypes as wt
            class MEMORYSTATUSEX(ctypes.Structure):
                # Mirrors the Win32 MEMORYSTATUSEX layout.
                _fields_ = [
                    ("dwLength", wt.DWORD),
                    ("dwMemoryLoad", wt.DWORD),
                    ("ullTotalPhys", ctypes.c_ulonglong),
                    ("ullAvailPhys", ctypes.c_ulonglong),
                    ("ullTotalPageFile", ctypes.c_ulonglong),
                    ("ullAvailPageFile", ctypes.c_ulonglong),
                    ("ullTotalVirtual", ctypes.c_ulonglong),
                    ("ullAvailVirtual", ctypes.c_ulonglong),
                    ("ullAvailExtendedVirtual", ctypes.c_ulonglong),
                ]
            stat = MEMORYSTATUSEX()
            stat.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
            # NOTE(review): the BOOL result is not checked; a failed call
            # would yield the zero-initialized struct's value.
            ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(stat))
            return int(stat.ullAvailPhys)
        except Exception:
            return None
    try:
        import psutil
        return int(psutil.virtual_memory().available)
    except Exception:
        return None
# ----------------------- GPU detection -----------------------
def _normalize_gpu_name(n: str) -> str:
n = (n or "")
n = re.sub(r"\s*\(Display.*?\)", "", n)
n = n.replace("NVIDIA", "").replace("GeForce", "").strip()
return re.sub(r"\s+", " ", n)
def _normalize_pci_bus_id(raw: str) -> str:
s = (str(raw or "").strip().lower())
if not s:
return ""
m = re.search(r"([0-9a-f]{4,8})?:?([0-9a-f]{1,2}):([0-9a-f]{1,2})(?:\.([0-7]))?", s)
if not m:
return ""
dom = (m.group(1) or "00000000")
if len(dom) == 4:
dom = "0000" + dom
elif len(dom) < 8:
dom = dom.rjust(8, "0")
bus = m.group(2).rjust(2, "0")
dev = m.group(3).rjust(2, "0")
fn = m.group(4) or "0"
return f"{dom}:{bus}:{dev}.{fn}"
def _extract_pci_bus_id_from_dev(dev) -> str | None:
    """Scan a Cycles device's ``id`` and ``name`` fields for an embedded
    PCI bus id; return it normalized, or None when nothing matches."""
    pattern = re.compile(
        r"(?:^|[_\s:])([0-9A-Fa-f]{4,8}:[0-9A-Fa-f]{1,2}:[0-9A-Fa-f]{1,2}(?:\.[0-7])?)"
    )
    for candidate in (getattr(dev, "id", ""), getattr(dev, "name", "")):
        found = pattern.search(str(candidate) or "")
        if found:
            normalized = _normalize_pci_bus_id(found.group(1))
            if normalized:
                return normalized
    return None
def _win_query_nvidia_smi_detailed():
    """Query nvidia-smi for the physical GPU inventory (Windows only).

    Returns a list of {"index", "uuid", "bus", "name"} dicts, or None
    when not on Windows or when nvidia-smi is unavailable/fails.
    """
    if not IS_WIN:
        return None
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,uuid,pci.bus_id,name", "--format=csv,noheader"],
            encoding="utf-8", errors="ignore"
        )
        phys = []
        for line in out.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 4:
                idx = int(parts[0])
                uuid = parts[1]
                bus = _normalize_pci_bus_id(parts[2]) or parts[2].lower()
                # GPU names may themselves contain commas; re-join the tail.
                name = ",".join(parts[3:]).strip()
                phys.append({"index": idx, "uuid": uuid, "bus": bus, "name": name})
        return phys
    except Exception:
        return None
def _dev_key(name: str, bus: str) -> tuple:
    """Stable identity key for a GPU: prefer the normalized PCI bus id,
    fall back to the normalized device name."""
    normalized_bus = _normalize_pci_bus_id(bus)
    if normalized_bus:
        return ("bus", normalized_bus)
    return ("name", _normalize_gpu_name(name))
# LEGACY: broad scan (often contains 'ghost' GPU)
def _detect_gpu_devices_legacy(selected_only=False):
    """Broad Cycles device scan for the current backend.

    Returns rows of (index, name, backend, selected, bus). Indices come
    from the nvidia-smi inventory when a device can be matched (first by
    PCI bus id, then by normalized name); unmatched devices get the 999
    sentinel index and sort last. Per the comment above, this broad scan
    can include a 'ghost' GPU entry.
    """
    cp = _cycles_prefs()
    if not cp:
        return []
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass
    devs = []
    for d in getattr(cp, "devices", []):
        if getattr(d, "type", "") != backend:
            continue
        sel = bool(getattr(d, "use", False))
        if selected_only and not sel:
            continue
        devs.append(d)
    if not devs:
        return []
    phys = _win_query_nvidia_smi_detailed()
    out = []
    if phys:
        bus_to_idx = {g["bus"]: g["index"] for g in phys}
        used = set()
        matched_dev_ids = set()
        # Pass 1: bus
        for d in devs:
            bus = _extract_pci_bus_id_from_dev(d)
            if bus and bus in bus_to_idx:
                idx = bus_to_idx[bus]; used.add(idx)
                out.append((idx, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), bus))
                matched_dev_ids.add(id(d))
        # Pass 2: name fallback
        for d in devs:
            if id(d) in matched_dev_ids:
                continue
            dn = _normalize_gpu_name(getattr(d,"name",""))
            cand = None
            for g in phys:
                if g["index"] in used: continue
                if _normalize_gpu_name(g["name"]) == dn:
                    cand = g; break
            if cand:
                out.append((cand["index"], getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), cand["bus"]))
                used.add(cand["index"])
            else:
                # Unmatched device: 999 sentinel index.
                out.append((999, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), _extract_pci_bus_id_from_dev(d) or ""))
        out.sort(key=lambda t: (t[0]==999, t[0]))
        return out
    # No nvidia-smi inventory available: enumerate in Cycles order.
    return [(i, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), _extract_pci_bus_id_from_dev(d) or "") for i,d in enumerate(devs)]
return [(i, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), _extract_pci_bus_id_from_dev(d) or "") for i,d in enumerate(devs)]
def _detect_gpu_devices_strict(selected_only=True):
    """Cycles device scan limited (by default) to user-ticked devices.

    Same row shape as _detect_gpu_devices_legacy: (index, name, backend,
    selected, bus) — the selected flag is emitted as the literal True
    here. Matching against the nvidia-smi inventory is by PCI bus id,
    then by normalized name; unmatched devices get the 999 sentinel.
    """
    cp = _cycles_prefs()
    if not cp:
        return []
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass
    devs = []
    for d in getattr(cp, "devices", []):
        if getattr(d, "type", "") != backend:
            continue
        sel = bool(getattr(d, "use", False))
        if selected_only and not sel:
            continue
        devs.append(d)
    if not devs:
        return []
    phys = _win_query_nvidia_smi_detailed()
    out = []
    if phys:
        bus_to_idx = {g["bus"]: g["index"] for g in phys}
        used = set()
        for d in devs:
            bus = _extract_pci_bus_id_from_dev(d)
            if bus and bus in bus_to_idx:
                idx = bus_to_idx[bus]; used.add(idx)
                out.append((idx, getattr(d,"name","?"), backend, True, bus))
            else:
                # Name fallback: first unused physical GPU with same name.
                dn = _normalize_gpu_name(getattr(d,"name",""))
                cand = None
                for g in phys:
                    if g["index"] in used: continue
                    if _normalize_gpu_name(g["name"]) == dn:
                        cand = g; break
                if cand:
                    used.add(cand["index"])
                    out.append((cand["index"], getattr(d,"name","?"), backend, True, cand["bus"]))
                else:
                    out.append((999, getattr(d,"name","?"), backend, True, _extract_pci_bus_id_from_dev(d) or ""))
        out.sort(key=lambda t: (t[0]==999, t[0]))
        return out
    # No nvidia-smi inventory available: enumerate in Cycles order.
    return [(i, getattr(d,"name","?"), backend, True, _extract_pci_bus_id_from_dev(d) or "") for i,d in enumerate(devs)]
def _multiset_subtract(primary_list, subtract_list):
    """Remove from *primary_list* one occurrence per matching device row
    in *subtract_list*, matching rows via _dev_key; order is preserved."""
    remaining = Counter(
        _dev_key(name, bus) for (_i, name, _t, _sel, bus) in subtract_list
    )
    kept = []
    for row in primary_list:
        (_i, name, _t, _sel, bus) = row
        key = _dev_key(name, bus)
        if remaining[key] > 0:
            remaining[key] -= 1
        else:
            kept.append(row)
    return kept
def _dedupe_selection_by_bus(rows):
    """Collapse duplicate rows sharing the same PCI bus id, keeping the
    best-scoring entry per bus. Rows that cannot be unpacked or carry no
    usable bus id pass through unchanged (and un-deduplicated)."""
    rows = list(rows or [])
    if not rows:
        return rows
    phys = _win_query_nvidia_smi_detailed() or []
    # bus id -> normalized physical GPU name, from the nvidia-smi inventory.
    phys_name_by_bus = {
        _normalize_pci_bus_id(g.get("bus")): _normalize_gpu_name(g.get("name"))
        for g in phys
        if _normalize_pci_bus_id(g.get("bus"))
    }
    out = []
    bus_pos = {}  # normalized bus id -> position of its kept row in `out`
    for row in rows:
        try:
            idx, name, backend, selected, bus = row
        except Exception:
            # Malformed row: keep as-is rather than dropping data.
            out.append(row)
            continue
        nbus = _normalize_pci_bus_id(bus)
        if not nbus:
            out.append(row)
            continue
        phys_norm = phys_name_by_bus.get(nbus, "")
        # NOTE(review): cur_norm is computed but never used.
        cur_norm = _normalize_gpu_name(name)
        def _score(_row, _norm):
            # Rank duplicates: +2 selected, +1 resolved (non-999) index,
            # +4 name agrees with the physical inventory for this bus.
            try:
                _idx, _name, _backend, _selected, _bus = _row
            except Exception:
                return -999
            s = 0
            if bool(_selected):
                s += 2
            if _idx != 999:
                s += 1
            if _norm and _normalize_gpu_name(_name) == _norm:
                s += 4
            return s
        if nbus not in bus_pos:
            bus_pos[nbus] = len(out)
            out.append((idx, name, backend, selected, nbus))
            continue
        pos = bus_pos[nbus]
        prev = out[pos]
        if _score((idx, name, backend, selected, nbus), phys_norm) > _score(prev, phys_norm):
            out[pos] = (idx, name, backend, selected, nbus)
    return out
def _detect_gpu_devices_final_from_lists(mode: str, legacy, strict):
    """Combine legacy/strict scan results according to *mode* and dedupe
    the outcome by PCI bus id.

    The default (any unrecognized mode) subtracts legacy rows from the
    strict list, but never drops explicitly selected strict GPUs.
    """
    legacy = list(legacy or [])
    strict = list(strict or [])
    if mode == "LEGACY_ONLY":
        return _dedupe_selection_by_bus(legacy)
    if mode == "STRICT_ONLY":
        return _dedupe_selection_by_bus(strict)
    if mode == "LEGACY_MINUS_STRICT":
        return _dedupe_selection_by_bus(_multiset_subtract(legacy, strict))
    final = _multiset_subtract(strict, legacy)
    # Safety: never silently drop explicitly selected strict GPUs.
    if strict and len(final) < len(strict):
        return _dedupe_selection_by_bus(strict)
    return _dedupe_selection_by_bus(final)
def _detect_gpu_devices_final(mode: str = "STRICT_MINUS_LEGACY"):
    """
    Return the final GPU row list for *mode*.

    mode:
      - 'STRICT_MINUS_LEGACY' -> strict(all) minus legacy(ghost) [default]
      - 'LEGACY_MINUS_STRICT' -> legacy minus strict
      - 'STRICT_ONLY' -> just strict
      - 'LEGACY_ONLY' -> just legacy
    """
    strict = _detect_gpu_devices_strict(selected_only=True)
    legacy = _detect_gpu_devices_legacy(selected_only=False)
    return _detect_gpu_devices_final_from_lists(mode, legacy, strict)
def _cycles_device_snapshot():
    """Diagnostic dump of the current Cycles backend and every device:
    {"backend": str|None, "rows": [{"name","type","use","bus","id"}]}."""
    cp = _cycles_prefs()
    if not cp:
        return {"backend": None, "rows": []}
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass
    rows = []
    for d in getattr(cp, "devices", []):
        rows.append({
            "name": getattr(d, "name", "?"),
            "type": getattr(d, "type", "?"),
            "use": bool(getattr(d, "use", False)),
            "bus": _extract_pci_bus_id_from_dev(d) or "",
            "id": str(getattr(d, "id", "") or ""),
        })
    return {"backend": backend, "rows": rows}
# -------- map selection to physical UUIDs --------
def _map_selection_to_uuids(sel_tuples):
    """Resolve selected GPU rows to physical GPUs from nvidia-smi.

    Per-row matching order: PCI bus id, then physical index, then
    normalized name; each physical UUID is consumed at most once.
    Without nvidia-smi data, or when a row stays unmatched, uuid and
    phys_index stay None (phys_index is also None for the 999 sentinel).
    """
    phys = _win_query_nvidia_smi_detailed()
    if not phys:
        out = []
        for (idx, name, _t, _sel, bus) in sel_tuples:
            out.append({
                "index": idx, "name": name, "cycles_name": name,
                "bus": _normalize_pci_bus_id(bus) or bus or "", "uuid": None,
                "phys_index": idx if idx != 999 else None
            })
        return out
    bus_map = {g["bus"]: g for g in phys}
    name_buckets = {}
    for g in phys:
        name_buckets.setdefault(_normalize_gpu_name(g["name"]), []).append(g)
    used_ids = set()
    out = []
    for (idx, name, _t, _sel, bus) in sel_tuples:
        bus = _normalize_pci_bus_id(bus) or (bus or "")
        g = None
        # 1) bus id match (strongest), skipping already-claimed UUIDs.
        if bus and bus in bus_map and bus_map[bus]["uuid"] not in used_ids:
            g = bus_map[bus]
        # 2) physical index match.
        if (g is None) and (idx is not None) and isinstance(idx, int):
            for cand in phys:
                if cand["index"] == idx and cand["uuid"] not in used_ids:
                    g = cand; break
        # 3) normalized-name match.
        if g is None:
            nb = name_buckets.get(_normalize_gpu_name(name), [])
            for cand in nb:
                if cand["uuid"] not in used_ids:
                    g = cand; break
        if g:
            used_ids.add(g["uuid"])
            out.append({
                "index": g["index"],
                # Prefer physical inventory naming from nvidia-smi.
                "name": g.get("name") or name,
                "cycles_name": name,
                "bus": g["bus"],
                "uuid": g["uuid"],
                "phys_index": g["index"],
            })
        else:
            out.append({
                "index": idx, "name": name, "cycles_name": name,
                "bus": bus or "", "uuid": None, "phys_index": None
            })
    return out
def _filter_known_mapped_gpus(mapped):
"""Hide unresolved mapped entries (phys_index=None => shown as '?')."""
keep = []
dropped = []
for m in list(mapped or []):
if m.get("phys_index") is None:
dropped.append(m)
else:
keep.append(m)
return keep, dropped
# Branding banner printed to worker console output.
BANNER_ASCII = r"""
▒▒▒▒▒▒▒▒▒▒▒ ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
▒▌▄ ███ ▄▐▒ ░█░█░███░█░░███░█████░███░███░█░
▒▌█▌███▐█▐▒ ░█░█░█░█░█░░█░█░█░█░█░█░█░█░█░█░
▒▌█▌███▐█▐▒ ░███░█░█░█░░█░█░█░█░█░███░██░░█░
▒▌█▌███▐█▐▒ ░█░█░█░█░█░░█░█░█░█░█░█░█░█░█░█░
▒▌█▌███▐█▐▒ ░█░█░███░██░███░█░█░█░█░█░█░█░█░
▒▌▀ ███ ▀▐▒ ░▒░▒░▒▒▒░▒▒░▒▒▒░▒░▒░▒░▒░▒░▒░▒░▒░
▒▒▒▒▒▒▒▒▒▒▒ BLENDER MULTI-INSTANCE RENDERER
[[Provided By HoloMARI - Holographic Media Creators Platform]]
Check our Holographic image rendering/sharing at: holomari.com
"""
# Manager-side diagnostics console banner (line 3 labels this as manager console).
BANNER_MANAGER_ASCII = "\n\nRENDER MANAGER CONSOLE\n\n" + BANNER_ASCII.lstrip("\n")
# Re-print the worker banner after this many output lines so it stays visible.
_WORKER_BANNER_REPEAT_EVERY_LINES = 30
# Banner text normalized to exactly one trailing newline for re-printing.
_WORKER_BANNER_REPEAT_TEXT = BANNER_ASCII.lstrip("\n").rstrip("\n") + "\n"
# ----------------------- child script -----------------------
_CHILD_SCRIPT_SRC = r"""
import bpy, sys, json, socket, os, re, time, traceback
try:
import addon_utils
except Exception:
addon_utils = None
# Make all prints flush immediately so the parent can time frames
try:
sys.stdout.reconfigure(line_buffering=True, write_through=True)
except Exception:
pass
HOST = "127.0.0.1"
args = sys.argv[sys.argv.index("--")+1:] if "--" in sys.argv else []
def _argval(flag, default=None):
if flag in args:
i = args.index(flag)
return args[i+1] if i+1 < len(args) else default
return default
PORT = int(_argval("--mgpu-port", "0"))
TOKEN = _argval("--mgpu-token", "")
TAG = _argval("--mgpu-tag", "worker")
DEVICE = _argval("--mgpu-device", None)
FALLBACK_DEVICE = _argval("--mgpu-fallback-device", "")
TARGET_GPU_BUS = _argval("--mgpu-gpu-bus", "") or ""
TARGET_GPU_NAME = _argval("--mgpu-gpu-name", "") or ""
THREADS = int(_argval("--mgpu-threads", "0") or "0")
USECPU = int(_argval("--mgpu-usecpu", "0") or "0")
DENOISE_GPU = int(_argval("--mgpu-denoise-gpu", "1") or "1")
PERSIST = int(_argval("--mgpu-persistent", "1") or "1")
MODE = _argval("--mgpu-mode", "FRAMES")
SRC_DIR = _argval("--src-dir", "") or ""
SEQ_DIR = _argval("--mgpu-seq-dir", "") or ""
SEQ_FMT = (_argval("--mgpu-seq-format", "PNG") or "PNG").upper()
SEQ_EXT = (_argval("--mgpu-seq-ext", ".png") or ".png").strip() or ".png"
if not SEQ_EXT.startswith("."):
SEQ_EXT = "." + SEQ_EXT
PRECHECKED_EXISTING = int(_argval("--mgpu-prechecked-existing", "0") or "0")
ADDONS_FILE = _argval("--mgpu-enabled-addons-file", "") or ""
MARI_PATH = _argval("--mari-path", "")
def _normalize_gpu_name(n):
n = str(n or "")
n = n.replace("NVIDIA", "").replace("GeForce", "").strip()
n = re.sub(r"\s+", " ", n)
return n.lower()
def _normalize_pci_bus_id(raw):
s = (str(raw or "").strip().lower())
if not s:
return ""
m = re.search(r"([0-9a-f]{4,8})?:?([0-9a-f]{1,2}):([0-9a-f]{1,2})(?:\.([0-7]))?", s)
if not m:
return ""
dom = (m.group(1) or "00000000")
if len(dom) == 4:
dom = "0000" + dom
elif len(dom) < 8:
dom = dom.rjust(8, "0")
bus = m.group(2).rjust(2, "0")
dev = m.group(3).rjust(2, "0")
fn = m.group(4) or "0"
return f"{dom}:{bus}:{dev}.{fn}"
def _extract_pci_bus_id_from_dev(dev):
pat = re.compile(r"([0-9A-Fa-f]{4,8}:[0-9A-Fa-f]{1,2}:[0-9A-Fa-f]{1,2}(?:\.[0-7])?)")
for field in (getattr(dev, "id", ""), getattr(dev, "name", "")):
m = pat.search(str(field) or "")
if m:
norm = _normalize_pci_bus_id(m.group(1))
if norm:
return norm
return ""
def _addon_record_fields(entry):
if isinstance(entry, str):
return entry, "", False
if isinstance(entry, dict):
return (
str(entry.get("module") or ""),
str(entry.get("file") or ""),
bool(entry.get("is_package", False)),
)
return "", "", False
def _load_addon_from_source(mod_name, src_path, is_package=False):
try:
import importlib.util
if not mod_name or not src_path:
return False
src_path = os.path.abspath(src_path)
if os.path.isdir(src_path):
is_package = True
init_path = os.path.join(src_path, "__init__.py")
if not os.path.isfile(init_path):
return False
spec = importlib.util.spec_from_file_location(
mod_name, init_path, submodule_search_locations=[src_path]
)
elif is_package or os.path.basename(src_path).lower() == "__init__.py":
pkg_dir = os.path.dirname(src_path)
spec = importlib.util.spec_from_file_location(
mod_name, src_path, submodule_search_locations=[pkg_dir]
)
else:
spec = importlib.util.spec_from_file_location(mod_name, src_path)
if not spec or not spec.loader:
return False
mod = sys.modules.get(mod_name)
if mod is None:
mod = importlib.util.module_from_spec(spec)
sys.modules[mod_name] = mod
spec.loader.exec_module(mod)
if hasattr(mod, "register"):
try:
mod.register()
except Exception:
pass
return True
except Exception:
return False
def _enable_parent_addons():
activated = []
if not ADDONS_FILE or not addon_utils:
return activated
try:
with open(ADDONS_FILE, "r", encoding="utf-8") as fp:
payload = json.load(fp)
mods = payload.get("addons", []) if isinstance(payload, dict) else payload
if not isinstance(mods, (list, tuple)):
mods = []
req = len(mods)
ok = 0
fail = 0
loaded_from_source = 0
already_enabled = 0
for entry in mods:
mod_name, src_path, is_package = _addon_record_fields(entry)
if not mod_name:
continue
try:
st = addon_utils.check(mod_name)
if isinstance(st, tuple) and any(bool(v) for v in st):
ok += 1
already_enabled += 1
activated.append(mod_name)
continue
except Exception:
pass
try:
addon_utils.enable(mod_name, default_set=False, persistent=False)
ok += 1
activated.append(mod_name)
except Exception:
if _load_addon_from_source(mod_name, src_path, is_package=is_package):
ok += 1
loaded_from_source += 1
activated.append(mod_name)
else:
fail += 1
print(f"[MGPU-CHILD] Add-on sync: requested={req} enabled={ok} already_enabled={already_enabled} loaded_from_source={loaded_from_source} failed={fail}")
sys.stdout.flush()
except Exception as e:
print(f"[MGPU-CHILD] WARNING: addon sync failed: {e}")
sys.stdout.flush()
return activated
_SYNCED_ADDON_MODULES = _enable_parent_addons()
try:
ops = dir(bpy.ops.mari)
print("[MGPU-CHILD] bpy.ops.mari ->", ", ".join(ops))
except Exception:
print("[MGPU-CHILD] bpy.ops.mari namespace missing")
if MARI_PATH:
try:
if MARI_PATH not in sys.path:
sys.path.insert(0, MARI_PATH)
try:
import holo_mari_addon as _hma
except Exception:
import importlib.util
p = os.path.join(MARI_PATH, "__init__.py")
spec = importlib.util.spec_from_file_location("holo_mari_addon", p)
_hma = importlib.util.module_from_spec(spec)
sys.modules["holo_mari_addon"] = _hma
spec.loader.exec_module(_hma)
if hasattr(_hma, "register"):
_hma.register()
_SYNCED_ADDON_MODULES.append("holo_mari_addon")
print("[MGPU-CHILD] Loaded MARI addon from path:", MARI_PATH); sys.stdout.flush()
except Exception as e:
print("[MGPU-CHILD] ERROR loading MARI addon path:", e); sys.stdout.flush()
def _ensure_threads():
try:
if THREADS and THREADS > 0:
try:
bpy.context.preferences.system.threads = THREADS
except Exception:
pass
except Exception:
pass
_ensure_threads()
def _scene_override_kwargs(scn):
kw = {}
if scn is None:
return kw
kw["scene"] = scn
try:
vls = getattr(scn, "view_layers", None)
if vls:
active_name = ""
try:
active_name = str(getattr(bpy.context.view_layer, "name", "") or "")
except Exception:
active_name = ""
if active_name and active_name in vls:
kw["view_layer"] = vls[active_name]
else:
kw["view_layer"] = vls[0]
except Exception:
pass
return kw
def _run_with_scene_override(scn, fn):
kw = _scene_override_kwargs(scn)
if kw:
try:
ctx = bpy.context.temp_override(**kw)
except Exception:
ctx = None
if ctx is not None:
with ctx:
return fn()
win = getattr(bpy.context, "window", None)
prev_scene = None
try:
if win and scn is not None:
prev_scene = win.scene
win.scene = scn
except Exception:
prev_scene = None
try:
return fn()
finally:
try:
if win and prev_scene is not None:
win.scene = prev_scene
except Exception:
pass
def _handler_matches_modules(handler, modules):
hmod = str(getattr(handler, "__module__", "") or "")
if not hmod:
return False
for mod_name in (modules or []):
mod_name = str(mod_name or "")
if mod_name and (hmod == mod_name or hmod.startswith(mod_name + ".")):
return True
return False
def _replay_addon_load_post_handlers(modules):
mods = [str(m or "") for m in (modules or []) if str(m or "")]
if not mods:
return
try:
handlers = list(getattr(bpy.app.handlers, "load_post", []) or [])
except Exception:
handlers = []
if not handlers:
return
filepath = bpy.data.filepath or ""
called = 0
failed = 0
for handler in handlers:
if not callable(handler) or not _handler_matches_modules(handler, mods):
continue
try:
handler(filepath)
called += 1
except TypeError:
try:
handler()
called += 1
except Exception as e:
failed += 1
print(f"[MGPU-CHILD] WARNING: load_post replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
sys.stdout.flush()
except Exception as e:
failed += 1
print(f"[MGPU-CHILD] WARNING: load_post replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
sys.stdout.flush()
if called or failed:
print(f"[MGPU-CHILD] load_post replay: called={called} failed={failed} modules={len(mods)}")
sys.stdout.flush()
def _replay_addon_scene_handlers(handler_name, scn, modules):
mods = [str(m or "") for m in (modules or []) if str(m or "")]
if not scn or not mods:
return
try:
handlers = list(getattr(bpy.app.handlers, handler_name, []) or [])
except Exception:
handlers = []
for handler in handlers:
if not callable(handler) or not _handler_matches_modules(handler, mods):
continue
try:
_run_with_scene_override(scn, lambda _h=handler: _h(scn))
except TypeError:
try:
_run_with_scene_override(scn, lambda _h=handler: _h())
except Exception as e:
print(f"[MGPU-CHILD] WARNING: {handler_name} replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
sys.stdout.flush()
except Exception as e:
print(f"[MGPU-CHILD] WARNING: {handler_name} replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
sys.stdout.flush()
def _force_scene_refresh(scn, frame=None):
if scn is None:
return
try:
target = int(scn.frame_current if frame is None else frame)
except Exception:
try:
target = int(scn.frame_current)
except Exception:
target = 0
try:
cur = int(scn.frame_current)
except Exception:
cur = target
def _apply():
try:
if cur == target:
alt = target - 1
try:
start = int(getattr(scn, "frame_start", target))
end = int(getattr(scn, "frame_end", target))
if alt < start and (target + 1) <= end:
alt = target + 1
if alt != target and start <= alt <= end:
try:
scn.frame_set(alt, subframe=0.0)
except TypeError:
scn.frame_set(alt)
except Exception:
pass
try:
scn.frame_set(target, subframe=0.0)
except TypeError:
scn.frame_set(target)
except Exception:
pass
try:
bpy.context.view_layer.update()
except Exception:
pass
try:
deps = bpy.context.evaluated_depsgraph_get()
upd = getattr(deps, "update", None)
if callable(upd):
upd()
except Exception:
pass
try:
_replay_addon_scene_handlers("frame_change_post", scn, _SYNCED_ADDON_MODULES)
except Exception:
pass
_run_with_scene_override(scn, _apply)
_replay_addon_load_post_handlers(_SYNCED_ADDON_MODULES)
try:
scn0 = bpy.context.scene
if scn0:
_force_scene_refresh(scn0, scn0.frame_current)
except Exception:
pass
# Cycles device setup:
# - GPU workers must stay on GPU (no silent CPU fallback).
# - If OptiX cannot enable devices, try CUDA as a fallback backend.
def _configure_cycles_devices():
try:
prefs = bpy.context.preferences.addons['cycles'].preferences
except Exception as e:
print(f"[MGPU-CHILD] WARNING: Cycles preferences unavailable: {e}")
sys.stdout.flush()
# For GPU workers, missing Cycles prefs must be treated as failure to
# prevent silent CPU rendering.
return bool(USECPU)
scn = bpy.context.scene
if USECPU:
try:
scn.cycles.device = 'CPU'
except Exception:
pass
try:
prefs.refresh_devices()
except Exception:
pass
cpu_enabled = 0
for d in getattr(prefs, "devices", []):
try:
dtype = str(getattr(d, "type", "") or "").upper()
if dtype == "CPU":
d.use = True
cpu_enabled += 1
else:
d.use = False
except Exception:
pass
print(f"[MGPU-CHILD] Cycles device setup: mode=CPU cpu_enabled={cpu_enabled}")
sys.stdout.flush()
return (cpu_enabled > 0)
# GPU worker path
try:
scn.cycles.device = 'GPU'
except Exception:
pass
wanted = str(DEVICE or "").upper()
fallback = str(FALLBACK_DEVICE or "").upper()
target_bus = _normalize_pci_bus_id(TARGET_GPU_BUS)
target_name_norm = _normalize_gpu_name(TARGET_GPU_NAME)
attempts = []
if wanted:
attempts.append(wanted)
if fallback and fallback not in attempts:
attempts.append(fallback)
if not attempts:
attempts = ["CUDA"]
for backend in attempts:
try:
prefs.compute_device_type = backend
except Exception as e:
print(f"[MGPU-CHILD] Cycles backend set failed for {backend}: {e}")
sys.stdout.flush()
continue
try:
prefs.refresh_devices()
except Exception as e:
print(f"[MGPU-CHILD] Cycles refresh failed for {backend}: {e}")
sys.stdout.flush()
continue
candidates = []
for d in getattr(prefs, "devices", []):
try:
dtype = str(getattr(d, "type", "") or "").upper()
if dtype == "CPU":
d.use = False
continue
if dtype != backend:
d.use = False
continue
bus = _extract_pci_bus_id_from_dev(d)
name = str(getattr(d, "name", dtype))
candidates.append((d, bus, name, dtype))
except Exception:
pass
selected = []
# Primary selector: PCI bus id (stable and unique per physical GPU).
if target_bus:
for (d, bus, name, _dtype) in candidates:
use_this = bool(bus and bus == target_bus)
d.use = use_this
if use_this:
selected.append((bus, name))
# Secondary selector: normalized device name (only if bus match failed).
if (not selected) and target_name_norm:
picked = False
for (d, _bus, name, _dtype) in candidates:
use_this = (not picked) and (_normalize_gpu_name(name) == target_name_norm)
d.use = use_this
if use_this:
selected.append((_bus, name))
picked = True
# Last resort: if only one candidate exists for this backend, use it.
if (not selected) and len(candidates) == 1:
d, bus, name, _dtype = candidates[0]
d.use = True
selected.append((bus, name))
print(f"[MGPU-CHILD] Cycles selector fallback: single-candidate backend={backend} bus={bus or '-'} name={name}")
sys.stdout.flush()
# If no explicit selector was passed, keep previous behavior (all backend devices).
if (not target_bus) and (not target_name_norm):
selected = []
for (d, bus, name, _dtype) in candidates:
d.use = True
selected.append((bus, name))
enabled = len(selected)
names = [n for (_b, n) in selected]
print(
f"[MGPU-CHILD] Cycles device setup: backend={backend} enabled_gpu={enabled} names={names} "
f"target_bus={target_bus or '-'} target_name={TARGET_GPU_NAME or '-'}"
)
sys.stdout.flush()
if enabled > 0:
if wanted and backend != wanted:
print(f"[MGPU-CHILD] Cycles backend fallback: requested={wanted} active={backend}")
sys.stdout.flush()
return True
else:
cands = [f"{b or '-'}:{n}" for (_d, b, n, _t) in candidates]
print(f"[MGPU-CHILD] Cycles selector no-match for backend={backend}; candidates={cands}")
sys.stdout.flush()
print(f"[MGPU-CHILD] ERROR: No GPU devices enabled for backends={attempts}; CPU fallback disabled for GPU worker.")
sys.stdout.flush()
return False
_CYCLES_READY = _configure_cycles_devices()
def _enforce_cycles_scene_device(scn):
try:
if not scn or scn.render.engine != "CYCLES":
return
scn.cycles.device = 'CPU' if USECPU else 'GPU'
except Exception:
pass
try:
for _scn in list(getattr(bpy.data, "scenes", []) or []):
_enforce_cycles_scene_device(_scn)
except Exception:
pass
def _cycles_runtime_report():
rep = {
"scene_device": None,
"compute_device_type": None,
"cpu_enabled": None,
"gpu_enabled": 0,
"enabled": [],
}
try:
scn = bpy.context.scene
rep["scene_device"] = str(getattr(getattr(scn, "cycles", None), "device", "") or "").upper()
except Exception:
pass
try:
prefs = bpy.context.preferences.addons['cycles'].preferences
rep["compute_device_type"] = str(getattr(prefs, "compute_device_type", "") or "").upper()
try:
prefs.refresh_devices()
except Exception:
pass
cpu_enabled = False
gpu_enabled = 0
enabled = []
for d in getattr(prefs, "devices", []):
try:
use = bool(getattr(d, "use", False))
dtype = str(getattr(d, "type", "") or "").upper()
name = str(getattr(d, "name", dtype))
if not use:
continue
enabled.append(f"{dtype}:{name}")
if dtype == "CPU":
cpu_enabled = True
else:
gpu_enabled += 1
except Exception:
pass
rep["cpu_enabled"] = cpu_enabled
rep["gpu_enabled"] = gpu_enabled
rep["enabled"] = enabled
except Exception as e:
rep["error"] = str(e)
return rep
def _cycles_policy_ok(require_gpu):
rep = _cycles_runtime_report()
if require_gpu:
ok = (
rep.get("scene_device") == "GPU" and
int(rep.get("gpu_enabled", 0) or 0) > 0 and
not bool(rep.get("cpu_enabled"))
)
else:
ok = (rep.get("scene_device") == "CPU" and bool(rep.get("cpu_enabled")))
return ok, rep
def _ensure_cycles_policy(require_gpu, phase):
ok, rep = _cycles_policy_ok(require_gpu)
if ok:
return True, rep
try:
_configure_cycles_devices()
except Exception:
pass
try:
_enforce_cycles_scene_device(bpy.context.scene)
except Exception:
pass
ok2, rep2 = _cycles_policy_ok(require_gpu)
if ok2:
print(f"[MGPU-CHILD] Cycles policy recovered at {phase}: {rep2}")
sys.stdout.flush()
return True, rep2
print(f"[MGPU-CHILD] GPU_POLICY_VIOLATION at {phase}: {rep2}")
sys.stdout.flush()
return False, rep2
def _set_enum_if_valid(owner, prop_name, value):
try:
prop = owner.bl_rna.properties.get(prop_name)
if not prop:
return False
items = [e.identifier for e in prop.enum_items]
if value in items:
setattr(owner, prop_name, value)
return True
except Exception:
pass
return False
def _configure_cycles_denoiser():
try:
scn = bpy.context.scene
except Exception:
return
if not scn or scn.render.engine != "CYCLES":
return
if not bool(DENOISE_GPU):
print("[MGPU-CHILD] Cycles denoise policy: disabled by add-on setting.")
sys.stdout.flush()
return
if bool(USECPU):
print("[MGPU-CHILD] Cycles denoise policy: CPU worker, leaving denoiser unchanged.")
sys.stdout.flush()
return
changed = []
try:
c = scn.cycles
except Exception:
c = None
if c is not None:
try:
if bool(getattr(c, "use_denoising", False)):
if _set_enum_if_valid(c, "denoiser", "OPTIX"):
changed.append("scene.cycles.denoiser=OPTIX")
except Exception:
pass
try:
if bool(getattr(c, "use_preview_denoising", False)):
if _set_enum_if_valid(c, "preview_denoiser", "OPTIX"):
changed.append("scene.cycles.preview_denoiser=OPTIX")
except Exception:
pass
try:
for vl in list(getattr(scn, "view_layers", []) or []):
vc = getattr(vl, "cycles", None)
if not vc:
continue
if bool(getattr(vc, "use_denoising", False)):
if _set_enum_if_valid(vc, "denoiser", "OPTIX"):
changed.append(f"view_layer[{vl.name}].cycles.denoiser=OPTIX")
except Exception:
pass
if changed:
print(f"[MGPU-CHILD] Cycles denoise policy: GPU denoiser configured ({'; '.join(changed)}).")
else:
print("[MGPU-CHILD] Cycles denoise policy: no active denoiser properties changed.")
sys.stdout.flush()
try:
if bpy.context.scene and bpy.context.scene.render.engine == "CYCLES":
_configure_cycles_denoiser()
print(f"[MGPU-CHILD] Cycles runtime report(init): {_cycles_runtime_report()}"); sys.stdout.flush()
except Exception:
pass
try:
bpy.context.scene.render.use_persistent_data = bool(PERSIST)
except Exception:
pass
# If we are building a video in the parent, workers render a temp image sequence.
try:
if MODE == "FRAMES" and SEQ_DIR:
os.makedirs(SEQ_DIR, exist_ok=True)
scn = bpy.context.scene
try:
scn.render.image_settings.color_mode = "RGBA"
except Exception:
pass
try:
if hasattr(scn.render.image_settings, "media_type"):
scn.render.image_settings.media_type = "IMAGE"
except Exception:
pass
scn.render.filepath = os.path.join(SEQ_DIR, "frame_")
try:
scn.render.use_file_extension = True
except Exception:
pass
except Exception as e:
print(f"[MGPU-CHILD] WARNING: sequence bootstrap setup failed: {e}"); sys.stdout.flush()
# --- Rebase MARI output folder to original .blend directory ---
try:
scn = bpy.context.scene
prop = getattr(scn, "mari_props", None)
if prop:
raw = getattr(prop, "render_settings_filepath", "") or ""
name = getattr(prop, "render_settings_name", "") or ""
rebased = raw
# If Blender-style relative path ("//..."), rebase against SRC_DIR
if raw.startswith("//") and SRC_DIR:
rebased = os.path.normpath(os.path.join(SRC_DIR, raw[2:]))
else:
# Resolve any other path using Blender's abspath (will be absolute already)
rebased = bpy.path.abspath(raw)
# Ensure a trailing separator
if not rebased.endswith(os.sep):
rebased += os.sep
# Persist back so ALL operators (including bpy.ops.mari.render_one) use the corrected absolute path
prop.render_settings_filepath = rebased
print(f"[MGPU-CHILD] Rebased MARI output dir to: {rebased} (name='{name}')"); sys.stdout.flush()
except Exception as e:
print("[MGPU-CHILD] WARNING: Could not rebase MARI output path:", e); sys.stdout.flush()
def _mari_ext_from_settings(scn):
# Match MARI add-on's extension mapping so filenames match everywhere
ff = scn.render.image_settings.file_format.lower()
if ff == "ffmpeg":
# MARI uses ffmpeg.format to decide container (e.g. mkv)
fmt = scn.render.ffmpeg.format
return {"MPEG1":"mpeg1","MPEG2":"mpeg2","MPEG4":"mp4","AVI":"avi","QUICKTIME":"mov",
"DV":"dv","OGG":"ogg","MKV":"mkv","FLASH":"flv","WEBM":"webm"}.get(fmt, "mkv")
return {
"jpeg":"jpeg","jpeg_2000":"jpeg","iris":"rgb",
"targa":"tga","targa_raw":"tga","cineon":"cin",
"open_exr":"exr","open_exr_multilayer":"exr",
"tiff":"tif","avi_jpeg":"avi","avi_raw":"avi",
"png":"png","bmp":"bmp"
}.get(ff, ff)
def _fix_mari_still_output_name(scn, H, V):
try:
prop = getattr(scn, "mari_props", None)
if not prop:
return
base_dir = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
name = (getattr(prop, "render_settings_name", "") or "").strip()
if not (base_dir and name):
return
if not base_dir.endswith(os.sep):
base_dir += os.sep
root = os.path.join(base_dir, name)
if not os.path.isdir(root):
return
ext = (_mari_ext_from_settings(scn) or "").lower().lstrip(".")
if not ext:
return
stem = f"{name}_H{int(H)}_V{int(V)}"
final_path = os.path.join(root, f"{stem}.{ext}")
candidates = []
for fname in os.listdir(root):
full = os.path.join(root, fname)
if not os.path.isfile(full):
continue
fstem, fext = os.path.splitext(fname)
if fext.lower().lstrip(".") != ext:
continue
if not fstem.startswith(stem):
continue
suffix = fstem[len(stem):]
if suffix and len(suffix) >= 3 and suffix.isdigit():
candidates.append(full)
if os.path.isfile(final_path) and os.path.getsize(final_path) > 0:
for extra in candidates:
try:
os.remove(extra)
except Exception:
pass
return
if not candidates:
return
candidates.sort(key=lambda p: os.path.getmtime(p), reverse=True)
os.replace(candidates[0], final_path)
for extra in candidates[1:]:
try:
os.remove(extra)
except Exception:
pass
except Exception:
pass
def _prime_mari_output_for_frame(scn, H, V, action):
prop = getattr(scn, "mari_props", None)
if not prop:
return
base_dir = bpy.path.abspath(prop.render_settings_filepath)
if not base_dir.endswith(os.sep):
base_dir += os.sep
name = prop.render_settings_name
ext = _mari_ext_from_settings(scn)
# Root "<base>\<name>\"
root = os.path.join(base_dir, name)
os.makedirs(root, exist_ok=True)
# For image-sequence ANIM, make the per-camera folder and let Blender append frame numbers (NAME_0001, NAME_0002 ...)
if action == "ANIM" and scn.render.image_settings.file_format.lower() != "ffmpeg":
cam_dir = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
os.makedirs(cam_dir, exist_ok=True)
scn.render.filepath = os.path.join(cam_dir, f"{name}_")
elif action == "STILL":
scn.render.filepath = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
else:
# Video ANIM: point to final video base (per camera)
scn.render.filepath = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
try:
scn.render.use_file_extension = True
except Exception:
pass
def jsend(sock, obj):
sock.sendall((json.dumps(obj) + "\n").encode("utf-8", "ignore"))
def jrecv(sock):
buf = b""
while b"\n" not in buf:
chunk = sock.recv(4096)
if not chunk:
raise ConnectionError("server closed")
buf += chunk
line, rest = buf.split(b"\n", 1)
return json.loads(line.decode("utf-8", "ignore"))
def _proj_bar(done, total, width=30):
try:
total = int(total)
done = max(0, int(done))
if total <= 0:
return "[------------------------------]", 0.0
ratio = min(1.0, done / float(total))
filled = int(round(width * ratio))
return "[" + ("#" * filled) + ("-" * (width - filled)) + "]", ratio * 100.0
except Exception:
return "[------------------------------]", 0.0
def _proj_print(H, V, elapsed, glb):
# glb carries per-job globals from the parent; we extend it in step 4
total = int(glb.get("proj_total") or 0)
done_before = int(glb.get("proj_done") or 0)
done_now = min(total, done_before + 1) if total > 0 else (done_before + 1)
bar, pct = _proj_bar(done_now, total)
try:
h = int(H)
except Exception:
h = H
try:
v = int(V)
except Exception:
v = V
print(f"[MGPU-PROJ] H{h}_V{v} | {float(elapsed):.2f}s | {bar} {pct:.1f}% ({done_now}/{total})")
sys.stdout.flush()
def _safe_out_path(scn, n):
# Native Blender path for frame n
try:
p = scn.render.frame_path(frame=n)
return bpy.path.abspath(p)
except Exception:
base = bpy.path.abspath(scn.render.filepath)
if "#" in base:
hashes = len(re.search(r"(#+)", base).group(1))
return re.sub(r"(#+)", str(n).zfill(hashes), base)
else:
root, ext = os.path.splitext(base)
if not ext:
ext = "." + (scn.render.file_extension or "png")
return f"{root}{str(n).zfill(4)}{ext}"
def _scene_expected_image_size(scn):
try:
pct = float(getattr(scn.render, "resolution_percentage", 100) or 100.0)
w = int(round(float(scn.render.resolution_x) * pct / 100.0))
h = int(round(float(scn.render.resolution_y) * pct / 100.0))
if w > 0 and h > 0:
return (w, h)
except Exception:
pass
return None
def _is_valid_render_output(path, expected_size=None):
try:
if os.path.getsize(path) <= 0:
return False
except Exception:
return False
img = None
try:
img = bpy.data.images.load(path, check_existing=False)
size = getattr(img, "size", None)
if not (size and size[0] > 0 and size[1] > 0):
return False
if expected_size:
try:
exp_w = int(expected_size[0]); exp_h = int(expected_size[1])
return int(size[0]) == exp_w and int(size[1]) == exp_h
except Exception:
return False
return True
except Exception:
return False
finally:
if img is not None:
try:
bpy.data.images.remove(img)
except Exception:
pass
def _render_meta(rendered=False, skipped=False, elapsed=0.0):
try:
elapsed = float(elapsed or 0.0)
except Exception:
elapsed = 0.0
return {
"rendered": bool(rendered),
"skipped": bool(skipped),
"elapsed": elapsed,
}
_MARI_RENDER_ONE_STATUS_KEY = "_mari_render_one_status"
def _mari_read_render_status(scn):
raw = ""
try:
raw = str(scn.get(_MARI_RENDER_ONE_STATUS_KEY, "") or "").upper()
except Exception:
raw = ""
return {
"status": raw,
"rendered": raw == "RENDERED",
"skipped": raw == "SKIPPED",
}
_SEQ_DIRECT_SAVE = False
_SEQ_DIRECT_SAVE_LOGGED = False
def _render_result_image():
img = bpy.data.images.get("Render Result")
if img is not None:
return img
for candidate in bpy.data.images:
try:
if getattr(candidate, "type", "") == "RENDER_RESULT":
return candidate
except Exception:
continue
return None
def _save_render_result_to_file(out_path, file_format="PNG"):
img = _render_result_image()
if img is None:
raise RuntimeError("Render Result image not found after frame render.")
prev_raw = getattr(img, "filepath_raw", "")
prev_fmt = getattr(img, "file_format", "PNG")
try:
os.makedirs(os.path.dirname(out_path), exist_ok=True)
except Exception:
pass
try:
img.filepath_raw = out_path
except Exception:
img.filepath = out_path
try:
img.file_format = str(file_format or "PNG").upper()
except Exception:
img.file_format = "PNG"
img.save()
try:
img.filepath_raw = prev_raw
except Exception:
pass
try:
img.file_format = prev_fmt
except Exception:
pass
def _render_frame_via_sandbox(main_scene, out_path, frame):
global _SEQ_DIRECT_SAVE, _SEQ_DIRECT_SAVE_LOGGED
prev_fp = main_scene.render.filepath
prev_use_ext = getattr(main_scene.render, "use_file_extension", True)
img = main_scene.render.image_settings
prev_fmt = getattr(img, "file_format", "PNG")
prev_mode = getattr(img, "color_mode", "RGBA")
prev_depth = getattr(img, "color_depth", "8")
prev_comp = getattr(img, "compression", 15)
prev_media = getattr(img, "media_type", None) if hasattr(img, "media_type") else None
try:
# Render from the real scene so scene-bound add-ons keep their state.
_force_scene_refresh(main_scene, frame)
use_direct_save = bool(_SEQ_DIRECT_SAVE)
if not use_direct_save:
try:
img.file_format = SEQ_FMT
img.color_mode = "RGBA"
img.color_depth = "16"
img.compression = 0
try:
if hasattr(img, "media_type"):
img.media_type = "IMAGE"
except Exception:
pass
except Exception as fmt_err:
use_direct_save = True
_SEQ_DIRECT_SAVE = True
if not _SEQ_DIRECT_SAVE_LOGGED:
print(
f"[MGPU-CHILD] {TAG} WARN: temp frame format {SEQ_FMT} unavailable on scene render settings; "
f"using Render Result direct-save fallback ({fmt_err})"
)
sys.stdout.flush()
_SEQ_DIRECT_SAVE_LOGGED = True
main_scene.render.filepath = os.path.splitext(out_path)[0]
try:
main_scene.render.use_file_extension = True
except Exception:
pass
if use_direct_save:
_run_with_scene_override(
main_scene,
lambda: bpy.ops.render.render(write_still=False, animation=False, use_viewport=False),
)
_save_render_result_to_file(out_path, file_format=SEQ_FMT)
return {'FINISHED'}
return _run_with_scene_override(
main_scene,
lambda: bpy.ops.render.render(write_still=True, animation=False, use_viewport=False),
)
finally:
main_scene.render.filepath = prev_fp
try:
main_scene.render.use_file_extension = prev_use_ext
except Exception:
pass
try:
img.file_format = prev_fmt
except Exception:
pass
try:
img.color_mode = prev_mode
except Exception:
pass
try:
img.color_depth = prev_depth
except Exception:
pass
try:
img.compression = prev_comp
except Exception:
pass
try:
if hasattr(img, "media_type") and prev_media is not None:
img.media_type = prev_media
except Exception:
pass
def render_frame(n):
scn = bpy.context.scene
expected_size = _scene_expected_image_size(scn)
if scn.render.engine == "CYCLES":
_enforce_cycles_scene_device(scn)
if not _CYCLES_READY:
return False, "Cycles device setup failed (no eligible GPU/CPU device configured for this worker).", _render_meta()
ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"pre-frame-{n}")
if not ok_policy:
return False, f"GPU_POLICY_VIOLATION pre-frame-{n}: {rep}", _render_meta()
_force_scene_refresh(scn, n)
if SEQ_DIR:
out_path = os.path.join(SEQ_DIR, f"frame_{n:04d}{SEQ_EXT}")
if (not PRECHECKED_EXISTING) and (not getattr(scn.render, "use_overwrite", True)):
try:
if os.path.exists(out_path) and _is_valid_render_output(out_path, expected_size=expected_size):
start = time.time()
print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
elapsed = time.time() - start
print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return True, "Skipped existing frame (overwrite disabled)", _render_meta(rendered=False, skipped=True, elapsed=elapsed)
except Exception:
pass
try:
os.makedirs(os.path.dirname(out_path), exist_ok=True)
except Exception:
pass
if getattr(scn.render, "use_placeholder", False):
try:
if not os.path.exists(out_path):
with open(out_path, "wb"):
pass
except Exception:
pass
try:
start = time.time()
print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
_render_frame_via_sandbox(scn, out_path, n)
ok = os.path.exists(out_path) and os.path.getsize(out_path) > 0
elapsed = time.time() - start
if ok and scn.render.engine == "CYCLES":
ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"post-frame-{n}")
if not ok_policy:
return False, f"GPU_POLICY_VIOLATION post-frame-{n}: {rep}", _render_meta(rendered=True, elapsed=elapsed)
if ok:
print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return True, "", _render_meta(rendered=True, elapsed=elapsed)
else:
print(f"[MGPU-CHILD] {TAG} MISSING frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return False, f"Rendered file missing or empty: {out_path}", _render_meta(elapsed=elapsed)
except Exception as e:
print(f"[MGPU-CHILD] {TAG} ERROR frame {n}: {e}"); sys.stdout.flush()
return False, str(e), _render_meta()
prev_fp = scn.render.filepath
prev_use_ext = getattr(scn.render, "use_file_extension", True)
out_path = _safe_out_path(scn, n)
if (not PRECHECKED_EXISTING) and (not getattr(scn.render, "use_overwrite", True)):
try:
if os.path.exists(out_path) and _is_valid_render_output(out_path, expected_size=expected_size):
start = time.time()
print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
elapsed = time.time() - start
print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return True, "Skipped existing frame (overwrite disabled)", _render_meta(rendered=False, skipped=True, elapsed=elapsed)
except Exception:
pass
try:
os.makedirs(os.path.dirname(out_path), exist_ok=True)
except Exception:
pass
if getattr(scn.render, "use_placeholder", False):
try:
if not os.path.exists(out_path):
with open(out_path, "wb"):
pass
except Exception:
pass
try:
scn.render.filepath = out_path
try:
scn.render.use_file_extension = False # out_path already has extension
except Exception:
pass
start = time.time()
print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
_run_with_scene_override(
scn,
lambda: bpy.ops.render.render(animation=False, write_still=True, use_viewport=False),
)
ok = os.path.exists(out_path) and os.path.getsize(out_path) > 0
elapsed = time.time() - start
if ok and scn.render.engine == "CYCLES":
ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"post-frame-{n}")
if not ok_policy:
return False, f"GPU_POLICY_VIOLATION post-frame-{n}: {rep}", _render_meta(rendered=True, elapsed=elapsed)
if ok:
print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return True, "", _render_meta(rendered=True, elapsed=elapsed)
else:
print(f"[MGPU-CHILD] {TAG} MISSING frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
return False, f"Rendered file missing or empty: {out_path}", _render_meta(elapsed=elapsed)
except Exception as e:
print(f"[MGPU-CHILD] {TAG} ERROR frame {n}: {e}"); sys.stdout.flush()
return False, str(e), _render_meta()
finally:
scn.render.filepath = prev_fp
try:
scn.render.use_file_extension = prev_use_ext
except Exception:
pass
def _ensure_mari_enabled():
# If we injected a path, we already imported & registered it
if MARI_PATH:
return True
try:
import addon_utils
for m in addon_utils.modules():
bi = getattr(m, "bl_info", {}) or {}
nm = (bi.get("name") or "").lower()
if "mari" in nm:
addon_utils.enable(m.__name__, default_set=True, persistent=True)
return True
return False
except Exception:
return False
def _mari_prop_mode_id(value):
key = str(value or "").upper()
if key == "FRAME":
return "FRAME"
return "CRICLE"
def _apply_mari_scene_settings(scn, glb):
prop = getattr(scn, "mari_props", None)
settings = dict(glb.get("mari_settings") or {})
try:
if "render_resolution_x" in glb:
scn.render.resolution_x = int(glb.get("render_resolution_x"))
if "render_resolution_y" in glb:
scn.render.resolution_y = int(glb.get("render_resolution_y"))
if "render_resolution_percentage" in glb:
scn.render.resolution_percentage = int(glb.get("render_resolution_percentage"))
except Exception:
pass
if not prop:
return
vector_props = ("frame_ratio", "frame_dimensions", "frame_center", "frame_rotation")
for name in vector_props:
if name not in settings:
continue
value = settings.get(name)
try:
setattr(prop, name, tuple(value))
continue
except Exception:
pass
try:
seq = tuple(value)
cur = getattr(prop, name)
for idx in range(min(len(cur), len(seq))):
cur[idx] = seq[idx]
except Exception:
pass
scalar_props = ("render_settings_filepath", "render_settings_name", "render_settings_normalize")
for name in scalar_props:
if name not in settings:
continue
try:
setattr(prop, name, settings.get(name))
except Exception:
pass
def _render_mari_job(job, glb):
'''
job: {"cam_name": str, "H": int, "V": int, ["frame": int]}
glb: {"mode": "FRAME"/"CIRCLE", "action": "STILL"/"ANIM", "is_video": bool}
'''
try:
if not _ensure_mari_enabled():
return False, "MARI add-on not enabled in child", _render_meta()
if not (hasattr(bpy.ops, "mari") and hasattr(bpy.ops.mari, "render_one")):
return False, "bpy.ops.mari.render_one unavailable", _render_meta()
scn = bpy.context.scene
if scn.render.engine == "CYCLES":
_enforce_cycles_scene_device(scn)
if not _CYCLES_READY:
return False, "Cycles device setup failed (no eligible GPU/CPU device configured for this worker).", _render_meta()
ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"pre-mari-{job.get('cam_name','?')}")
if not ok_policy:
return False, f"GPU_POLICY_VIOLATION pre-mari: {rep}", _render_meta()
prop = getattr(scn, "mari_props", None)
try:
scn.render.use_overwrite = bool(glb.get("use_overwrite", scn.render.use_overwrite))
if hasattr(scn.render, "use_placeholder"):
scn.render.use_placeholder = bool(glb.get("use_placeholder", scn.render.use_placeholder))
except Exception:
pass
try:
_apply_mari_scene_settings(scn, glb)
except Exception:
pass
cam_name = job.get("cam_name")
cam_obj = bpy.data.objects.get(cam_name) if cam_name else None
if cam_obj:
scn.camera = cam_obj
else:
return False, f"Camera '{cam_name}' not found", _render_meta()
try:
bpy.context.view_layer.update()
except Exception:
pass
mode_target = _mari_prop_mode_id(glb.get("mode"))
if prop and _mari_prop_mode_id(getattr(prop, "frame", None)) != mode_target:
try:
prop.frame = mode_target
print(f"[MGPU-CHILD] Adjusted MARI mode to {prop.frame}")
except Exception:
pass
try:
obj = bpy.context.object
if obj and obj.mode != 'OBJECT':
bpy.ops.object.mode_set(mode='OBJECT', toggle=False)
except Exception:
pass
action = glb.get("action", "STILL")
try:
frame = int(job.get("frame", -1))
except Exception:
frame = -1
try:
_force_scene_refresh(scn, (frame if frame >= 0 else scn.frame_current))
except Exception:
pass
H = job.get("H")
V = job.get("V")
if prop:
# Ensure per-job paths exist and set filepaths to avoid any spillover across cameras.
_prime_mari_output_for_frame(scn, H, V, action)
try:
scn.render.use_file_extension = True
except Exception:
pass
tag = TAG
start_msg = f"[MGPU-CHILD] {tag} start MARI {action} H{H} V{V}"
if frame >= 0:
start_msg += f" f{frame}"
print(start_msg + f" -> {cam_name}")
st = time.time()
try:
scn[_MARI_RENDER_ONE_STATUS_KEY] = ""
except Exception:
pass
try:
res = _run_with_scene_override(
scn,
lambda: bpy.ops.mari.render_one(camera_name=cam_name, action=action, frame=frame),
)
except Exception as call_err:
return False, str(call_err), _render_meta()
ok = (res == {'FINISHED'})
if ok and scn.render.engine == "CYCLES":
ok_policy, rep = _ensure_cycles_policy(
require_gpu=(not bool(USECPU)),
phase=f"post-mari-{cam_name}-f{frame}"
)
if not ok_policy:
return False, f"GPU_POLICY_VIOLATION post-mari: {rep}", _render_meta(rendered=True, elapsed=(time.time() - st))
if ok and action == "STILL":
_fix_mari_still_output_name(scn, H, V)
elapsed = time.time() - st
status = _mari_read_render_status(scn)
fin_msg = f"[MGPU-CHILD] {tag} finished MARI {action} H{H} V{V}"
if frame >= 0:
fin_msg += f" f{frame}"
if ok:
print(fin_msg + f" ({elapsed:.2f}s) -> {cam_name}")
else:
print(fin_msg.replace("finished", "MISSING") + f" ({elapsed:.2f}s) -> {cam_name}")
sys.stdout.flush()
_proj_print(H, V, elapsed, glb)
meta = _render_meta(rendered=status.get("rendered"), skipped=status.get("skipped"), elapsed=elapsed)
return ok, "" if ok else "mari.render_one returned CANCELLED", meta
except Exception as e:
traceback.print_exc()
return False, str(e), _render_meta()
# connect to parent scheduler
import socket
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
_CONNECT_RETRIES = 40
for attempt in range(_CONNECT_RETRIES):
try:
sock.connect((HOST, PORT))
break
except Exception as e:
if attempt + 1 == _CONNECT_RETRIES:
raise
print(f"[MGPU-CHILD] {TAG} waiting for scheduler ({attempt+1}/{_CONNECT_RETRIES}): {e}")
time.sleep(0.25)
jsend(sock, {"hello": TAG, "token": TOKEN})
while True:
jsend(sock, {"get": True})
msg = jrecv(sock)
if msg.get("exit"):
print(f"[MGPU-CHILD] {TAG} exit signal received"); sys.stdout.flush()
break
if MODE == "MARI" and msg.get("mari_job"):
job = msg.get("mari_job") or {}
glb = msg.get("globals") or {}
ok, err, meta = _render_mari_job(job, glb)
if not ok:
print(f"[MGPU-CHILD] {TAG} ERROR MARI job {job.get('cam_name')} f{job.get('frame','-')}: {err}"); sys.stdout.flush()
jsend(sock, {"done": job.get("cam_name"), "ok": bool(ok), "job": job, "err": err, "meta": meta})
continue
# default legacy: frames
if "frame" in msg:
n = int(msg["frame"])
ok, err, meta = render_frame(n)
jsend(sock, {"done": n, "ok": bool(ok), "err": err, "meta": meta})
continue
try:
sock.close()
except Exception:
pass
"""
def _write_child_script(dirpath):
    """Materialize the embedded worker script inside *dirpath*.

    Writes ``_CHILD_SCRIPT_SRC`` to ``mgpu_worker.py`` (UTF-8) and returns
    the resulting file path as a string.
    """
    target = Path(dirpath) / "mgpu_worker.py"
    with target.open("w", encoding="utf-8") as fh:
        fh.write(_CHILD_SCRIPT_SRC)
    return str(target)
def _mgpu_scene_expected_image_size(scene):
try:
pct = float(getattr(scene.render, "resolution_percentage", 100) or 100.0)
w = int(round(float(scene.render.resolution_x) * pct / 100.0))
h = int(round(float(scene.render.resolution_y) * pct / 100.0))
if w > 0 and h > 0:
return (w, h)
except Exception:
pass
return None
def _mgpu_existing_file_nonempty(path):
try:
return os.path.isfile(path) and os.path.getsize(path) > 0
except Exception:
return False
def _mgpu_validate_existing_image(path, expected_size=None, cache=None):
if cache is not None and path in cache:
return bool(cache[path])
ok = False
img = None
try:
if os.path.getsize(path) <= 0:
ok = False
else:
img = bpy.data.images.load(path, check_existing=False)
size = getattr(img, "size", None)
ok = bool(size and size[0] > 0 and size[1] > 0)
if ok and expected_size:
try:
ok = (int(size[0]) == int(expected_size[0]) and int(size[1]) == int(expected_size[1]))
except Exception:
ok = False
except Exception:
ok = False
finally:
if img is not None:
try:
bpy.data.images.remove(img)
except Exception:
pass
if cache is not None:
cache[path] = bool(ok)
return bool(ok)
def _mgpu_scan_dir_files(dir_path):
index = {}
try:
if not os.path.isdir(dir_path):
return index
with os.scandir(dir_path) as it:
for entry in it:
try:
if not entry.is_file():
continue
except Exception:
continue
try:
size = int(entry.stat().st_size)
except Exception:
size = -1
index[entry.name.lower()] = {"path": entry.path, "size": size}
except Exception:
return {}
return index
def _mgpu_scene_frame_output_path(scene, frame):
try:
return bpy.path.abspath(scene.render.frame_path(frame=int(frame)))
except Exception:
try:
return bpy.path.abspath(scene.render.filepath)
except Exception:
return ""
def _mgpu_video_seq_frame_path(seq_dir, frame, ext=".png"):
ext = str(ext or ".png").strip() or ".png"
if not ext.startswith("."):
ext = "." + ext
try:
frame_num = int(frame)
except Exception:
frame_num = frame
return os.path.join(seq_dir, f"frame_{frame_num:04d}{ext}")
def _mgpu_mari_ext_from_scene(scene):
try:
ff = str(scene.render.image_settings.file_format or "").lower()
except Exception:
ff = ""
if ff == "ffmpeg":
try:
fmt = str(scene.render.ffmpeg.format or "")
except Exception:
fmt = ""
return {
"MPEG1": "mpeg1",
"MPEG2": "mpeg2",
"MPEG4": "mp4",
"AVI": "avi",
"QUICKTIME": "mov",
"DV": "dv",
"OGG": "ogg",
"MKV": "mkv",
"FLASH": "flv",
"WEBM": "webm",
}.get(fmt, "mkv")
return {
"jpeg": "jpeg",
"jpeg_2000": "jpeg",
"iris": "rgb",
"targa": "tga",
"targa_raw": "tga",
"cineon": "cin",
"open_exr": "exr",
"open_exr_multilayer": "exr",
"tiff": "tif",
"avi_jpeg": "avi",
"avi_raw": "avi",
"png": "png",
"bmp": "bmp",
}.get(ff, ff)
def _mgpu_mari_output_root(scene):
try:
prop = getattr(scene, "mari_props", None)
if not prop:
return "", ""
base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
name = str(getattr(prop, "render_settings_name", "") or "").strip()
if not (base and name):
return "", ""
return os.path.join(base, name), name
except Exception:
return "", ""
def _mgpu_format_hv_label(value):
try:
return str(int(value))
except Exception:
return str(value)
def _mgpu_get_vse_strip_collection(se):
if hasattr(se, "strips"):
return se.strips
if hasattr(se, "strips_all"):
return se.strips_all
if hasattr(se, "sequences"):
return se.sequences
if hasattr(se, "sequences_all"):
return se.sequences_all
return None
def _mgpu_build_video_from_sequence(scene, seq_dir, frames, final_path):
    """Encode an already-rendered image sequence into one video file.

    Creates (preferably on a copy of *scene*) a temporary VSE image strip
    containing *frames* (file names inside *seq_dir*, sorted), renders the
    animation through the sequencer to *final_path* (Blender supplies the
    container extension), then restores every scene setting it touched and
    removes the temporary strip/scene.

    Raises RuntimeError when *frames* is empty, the first frame file does
    not exist, or the sequence editor exposes no strip collection.
    """
    if not frames:
        raise RuntimeError("No frames provided to build video.")
    seq_dir = os.path.normpath(seq_dir)
    frames = sorted(frames)
    first_name = frames[0]
    first_path = os.path.join(seq_dir, first_name)
    if not os.path.isfile(first_path):
        raise RuntimeError(f"First frame not found: {first_path!r}")
    # Work on a scene copy when possible so a failure cannot corrupt the user's scene.
    work_scene = scene
    created_scene = False
    try:
        work_scene = scene.copy()
        work_scene.name = "MGPU_TEMP_VSE"
        created_scene = True
    except Exception:
        work_scene = scene
    # Snapshot everything mutated below so the finally block can restore it.
    old_se = work_scene.sequence_editor
    old_frame_start = work_scene.frame_start
    old_frame_end = work_scene.frame_end
    old_filepath = work_scene.render.filepath
    old_use_seq = work_scene.render.use_sequencer
    old_use_cmp = work_scene.render.use_compositing
    se = old_se if old_se is not None else work_scene.sequence_editor_create()
    strip_coll = _mgpu_get_vse_strip_collection(se)
    if strip_coll is None:
        raise RuntimeError("SequenceEditor has no strips/sequences collection.")
    # Clear any pre-existing strips so only our sequence gets rendered.
    try:
        for s in list(strip_coll):
            strip_coll.remove(s)
    except Exception:
        pass
    strip = None
    frame_count = len(frames)
    try:
        try:
            strip = strip_coll.new_image(
                name="MGPU_TEMP_SEQ",
                filepath=first_path,
                channel=1,
                frame_start=old_frame_start,
            )
        except TypeError:
            # Older Blender API variant: positional new_image() signature.
            strip = strip_coll.new_image("MGPU_TEMP_SEQ", first_path, 1, old_frame_start)
        directory = seq_dir + os.sep if not seq_dir.endswith(os.sep) else seq_dir
        strip.directory = directory
        if strip.elements:
            strip.elements[0].filename = first_name
        else:
            strip.elements.append(first_name)
        for name in frames[1:]:
            strip.elements.append(name)
        strip.frame_start = old_frame_start
        strip.frame_final_duration = frame_count
        work_scene.frame_start = old_frame_start
        work_scene.frame_end = old_frame_start + frame_count - 1
        work_scene.render.use_sequencer = True
        work_scene.render.use_compositing = False
        # Blender appends the container extension to this base path itself.
        work_scene.render.filepath = os.path.splitext(final_path)[0]
        try:
            # Preferred path (Blender 3.2+): context override onto the temp scene.
            with bpy.context.temp_override(scene=work_scene, view_layer=work_scene.view_layers[0]):
                bpy.ops.render.render(animation=True)
        except Exception:
            # Fallback: temporarily swap the window's active scene.
            win = bpy.context.window
            prev_scene = win.scene if win else None
            try:
                if win:
                    win.scene = work_scene
                bpy.ops.render.render(animation=True)
            finally:
                if win and prev_scene:
                    win.scene = prev_scene
    finally:
        # Best-effort restore of mutated settings and cleanup of temp objects.
        try:
            work_scene.render.filepath = old_filepath
            work_scene.frame_start = old_frame_start
            work_scene.frame_end = old_frame_end
            work_scene.render.use_sequencer = old_use_seq
            work_scene.render.use_compositing = old_use_cmp
        except Exception:
            pass
        try:
            if strip is not None and strip_coll is not None and hasattr(strip_coll, "remove"):
                strip_coll.remove(strip)
        except Exception:
            pass
        if created_scene:
            try:
                bpy.data.scenes.remove(work_scene)
            except Exception:
                pass
# ----------------------- progress parsing (parent side) -----------------------
_SAMPLE_RE = re.compile(r"[Ss]amples?\s+(\d+)\s*/\s*(\d+)")
_TILE_RE = re.compile(r"[Tt]iles?\s+(\d+)\s*/\s*(\d+)")
_TILE2_RE = re.compile(r"[Tt]ile\s+(\d+)\s*/\s*(\d+)")
_CHILD_START_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+start\s+frame\s+(\d+)\s+->\s+(.+)$")
_CHILD_FIN_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+finished\s+frame\s+(\d+)\s+\(([\d.]+)s\)\s+->\s+(.+)$")
_CHILD_MISS_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+MISSING\s+frame\s+(\d+)\s+\(([\d.]+)s\)\s+->\s+(.+)$")
_CHILD_MARI_START_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+start\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+->\s+(.+)$")
_CHILD_MARI_FIN_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+finished\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+\(([\d.]+)s\)\s+->\s+(.+)$")
_CHILD_MARI_MISS_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+MISSING\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+\(([\d.]+)s\)\s+->\s+(.+)$")
def _parse_progress_fields(line: str):
s_cur = s_tot = t_cur = t_tot = None
m = _SAMPLE_RE.search(line)
if m:
try:
s_cur, s_tot = int(m.group(1)), int(m.group(2))
except Exception:
pass
m2 = _TILE_RE.search(line) or _TILE2_RE.search(line)
if m2:
try:
t_cur, t_tot = int(m2.group(1)), int(m2.group(2))
except Exception:
pass
return s_cur, s_tot, t_cur, t_tot
def _progress_percent(s_cur, s_tot, t_cur, t_tot):
if s_cur is not None and s_tot and s_tot > 0:
return max(0.0, min(100.0, (s_cur / s_tot) * 100.0))
if t_cur is not None and t_tot and t_tot > 0:
return max(0.0, min(100.0, (t_cur / t_tot) * 100.0))
return None
def _progress_bar(pct, width=20):
if pct is None: return "-" * width
filled = max(0, min(width, int(round((pct / 100.0) * width))))
return "#" * filled + "-" * (width - filled)
# ----------------------- Windows Job Object (kill children on Blender exit) -----------------------
# Process-wide Job Object handle; created lazily by _win_job_init() via _win_job_assign().
_WS_JOB = None
def _win_job_init():
    """Create a Windows Job Object configured to kill every assigned child
    process when the job handle is closed (i.e. when this Blender exits).

    Returns the job HANDLE, or None on non-Windows platforms or any failure.
    """
    if not IS_WIN: return None
    try:
        import ctypes
        from ctypes import wintypes as wt
        kernel32 = ctypes.windll.kernel32
        CreateJobObjectW = kernel32.CreateJobObjectW
        SetInformationJobObject = kernel32.SetInformationJobObject
        # ctypes mirror of the Win32 JOBOBJECT_BASIC_LIMIT_INFORMATION struct.
        class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ("PerProcessUserTimeLimit", ctypes.c_longlong),
                ("PerJobUserTimeLimit", ctypes.c_longlong),
                ("LimitFlags", wt.DWORD),
                ("MinimumWorkingSetSize", ctypes.c_size_t),
                ("MaximumWorkingSetSize", ctypes.c_size_t),
                ("ActiveProcessLimit", wt.DWORD),
                ("Affinity", wt.LPVOID),
                ("PriorityClass", wt.DWORD),
                ("SchedulingClass", wt.DWORD),
            ]
        # ctypes mirror of the Win32 IO_COUNTERS struct.
        class IO_COUNTERS(ctypes.Structure):
            _fields_ = [
                ("ReadOperationCount", ctypes.c_ulonglong),
                ("WriteOperationCount", ctypes.c_ulonglong),
                ("OtherOperationCount", ctypes.c_ulonglong),
                ("ReadTransferCount", ctypes.c_ulonglong),
                ("WriteTransferCount", ctypes.c_ulonglong),
                ("OtherTransferCount", ctypes.c_ulonglong),
            ]
        # ctypes mirror of JOBOBJECT_EXTENDED_LIMIT_INFORMATION.
        class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ("BasicLimitInformation", JOBOBJECT_BASIC_LIMIT_INFORMATION),
                ("IoInfo", IO_COUNTERS),
                ("ProcessMemoryLimit", ctypes.c_size_t),
                ("JobMemoryLimit", ctypes.c_size_t),
                ("PeakProcessMemoryUsed", ctypes.c_size_t),
                ("PeakJobMemoryUsed", ctypes.c_size_t),
            ]
        JOB_OBJECT_EXTENDED_LIMIT_INFORMATION = 9
        JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000
        hJob = CreateJobObjectW(None, None)
        if not hJob:
            return None
        info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
        # KILL_ON_JOB_CLOSE: OS terminates all job processes when the last handle closes.
        info.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
        if not SetInformationJobObject(hJob, JOB_OBJECT_EXTENDED_LIMIT_INFORMATION,
                                       ctypes.byref(info), ctypes.sizeof(info)):
            return None
        return hJob
    except Exception:
        return None
def _win_job_assign(proc):
    """Attach *proc* to the shared kill-on-close Job Object (Windows only).

    Lazily creates the job on first use; all failures are ignored so a
    missing API/permission never blocks launching the child.
    """
    if not IS_WIN or not proc: return
    global _WS_JOB
    if _WS_JOB is None:
        _WS_JOB = _win_job_init()
    if _WS_JOB:
        try:
            import ctypes
            AssignProcessToJobObject = ctypes.windll.kernel32.AssignProcessToJobObject
            # Popen._handle is the underlying Win32 process HANDLE.
            AssignProcessToJobObject(_WS_JOB, int(proc._handle))
        except Exception:
            pass
# ----------------------- manager / workers -----------------------
class Worker:
    """Bookkeeping for one headless child Blender process.

    A worker is pinned either to one GPU (by UUID/bus id; several instances
    may share a GPU) or to the CPU.  The manager owns the lifecycle: it
    launches ``proc``, tails its stdout into ``log_path``, and records
    handshake, progress and guard/restart state in the attributes below.
    """
    def __init__(self, tag_label, gpu_uuid, phys_index, instance_index, total_instances, is_cpu=False, gpu_bus="", gpu_name=""):
        self.is_cpu = bool(is_cpu)
        self.gpu_uuid = gpu_uuid  # 'GPU-xxxx...' or None for CPU
        self.phys_index = phys_index  # physical index for display (int or None)
        self.gpu_bus = str(gpu_bus or "")
        self.gpu_name = str(gpu_name or "")
        self.instance_index = instance_index
        self.total_instances = total_instances
        self.proc = None  # subprocess.Popen of the child Blender (None until launched)
        self.stdout_thread = None
        self.last_line = ""
        self.log_path = None
        self._log_fp = None
        self.tag = f"{tag_label}-#{instance_index}"
        # Filesystem-safe variant of the tag, used for log file names.
        safe = re.sub(r"[^A-Za-z0-9._-]+", "_", self.tag)
        if not safe:
            safe = f"worker_{instance_index}"
        self._file_tag = safe
        self._live_line_active = False
        self._last_samples = (None, None)  # remember for the final 100% line
        self.local_frames = []
        # terminal tail process (PowerShell/xterm)
        self.term_proc = None
        # progress throttling state
        self._last_emit_time = 0.0
        self._last_pct = -1.0
        self._last_s_pair = (None, None)
        self._last_t_pair = (None, None)
        # per-frame state (for richer messages)
        self.cur_frame = None
        self.cur_path = None
        self.frame_start_time = 0.0
        # launch diagnostics
        self.launch_state = "PLANNED"
        self.launch_reason = "planned"
        self.launch_detail = ""
        self.launch_attempted = False
        self.launch_ok = False
        self.launch_ts = 0.0
        self.launch_pid = None
        self.hello_received = False
        self.hello_ts = 0.0
        self.hello_timeout_reported = False
        self.exit_before_hello_reported = False
        # render-time guard bookkeeping (stall detection / restart history)
        self.guard_last_progress_ts = 0.0
        self.guard_last_progress_sig = None
        self.guard_epoch = 0
        self.guard_restart_ts = deque(maxlen=16)
        self.guard_restart_marks = []
        self.guard_restarts_total = 0
        # Cycles backend fallback state (e.g. OPTIX -> CUDA after policy failures)
        self.cycles_backend_override = None
        self.cycles_policy_failures = 0
        self.cycles_cpu_hint_ts = 0.0
        self.cycles_cpu_hint_line = ""
        self._banner_lines_since_repeat = 0
    @property
    def running(self):
        """True while the child process exists and has not exited."""
        return (self.proc is not None) and (self.proc.poll() is None)
    def open_log(self, path):
        """Open this worker's per-instance log file in *path*'s directory."""
        dir_path = os.path.dirname(path)
        os.makedirs(dir_path, exist_ok=True)
        self.log_path = os.path.join(dir_path, f"{self._file_tag}.log")
        # UTF-8 with BOM so PS5/PS7 detect UTF-8 and render block glyphs correctly
        self._log_fp = open(self.log_path, "w", encoding="utf-8-sig", newline="")
    def alive(self) -> bool:
        """Method form of ``running``; never raises."""
        p = getattr(self, "proc", None)
        try:
            return (p is not None) and (p.poll() is None)
        except Exception:
            return False
    def close_log(self):
        """Close the log file handle, ignoring errors."""
        try:
            if self._log_fp:
                self._log_fp.close()
        except Exception:
            pass
class MultiGPUManager:
    def __init__(self, scene, threads=0, instances_per_gpu=1, dispatch_mode="DYNAMIC", max_retries=2,
                 open_terms=True, ghost_mode="STRICT_MINUS_LEGACY", use_persistent_data=True,
                 job_mode="FRAMES", mari_jobs=None, mari_globals=None, render_guard_tier="AGGRESSIVE",
                 denoise_on_gpu=True):
        """Plan a multi-worker render of *scene*.

        Detects and filters GPUs, builds the Worker plan (instances_per_gpu
        workers per mapped GPU, plus an optional CPU worker under Cycles),
        expands the job list (frame range or MARI jobs), and configures the
        render-time guard.  Raises RuntimeError when Cycles ends up with no
        usable workers.
        """
        self.scene = scene
        self.device_mode = _current_compute_type()
        # OPTIX workers may fall back to CUDA after device failures.
        self.fallback_device_mode = "CUDA" if str(self.device_mode or "").upper() == "OPTIX" else ""
        self.cpu_selected = bool(_cycles_cpu_device_selected()) if scene.render.engine == "CYCLES" else False
        self.threads = int(threads)
        self.instances_per_gpu = max(1, int(instances_per_gpu))
        self.dispatch_mode = dispatch_mode
        self.max_retries = max(0, int(max_retries))
        self.open_terms = bool(open_terms)
        self.ghost_mode = ghost_mode
        self.use_persistent_data = bool(use_persistent_data)
        self.denoise_on_gpu = bool(denoise_on_gpu)
        self.job_mode = job_mode  # "FRAMES" or "MARI"
        self.mari_jobs = list(mari_jobs or [])
        self.mari_globals = dict(mari_globals or {})
        # --- render-time guard configuration (stall detection / recycling) ---
        self.render_guard_tier = str(render_guard_tier or "AGGRESSIVE").upper()
        self.rt_guard_cfg = _rendertime_guard_profile(self.render_guard_tier)
        self.rt_guard_enabled = bool(self.rt_guard_cfg.get("enabled", False))
        self.rt_guard_pause_until = 0.0
        self.rt_guard_job_state = {}
        self.rt_guard_restart_ts = deque(maxlen=64)
        self._worker_hist = {}
        self._global_hist = deque(maxlen=80)
        self._rt_guard_last_log = {}
        self.rt_periodic_recycle_enabled = bool(self.rt_guard_cfg.get("periodic_recycle_enabled", False))
        # Only keep recycle points strictly inside (0, 1).
        raw_points = list(self.rt_guard_cfg.get("periodic_recycle_points", []) or [])
        cleaned_points = []
        for p in raw_points:
            try:
                fp = float(p)
                if 0.0 < fp < 1.0:
                    cleaned_points.append(fp)
            except Exception:
                pass
        self.rt_periodic_recycle_points = sorted(set(cleaned_points))
        self.rt_periodic_recycle_seen = set()
        self.rt_periodic_recycle_pending = {}
        # --- video-output detection (frames go to a temp dir, encoded later) ---
        self.video_mode = False
        self.video_seq_dir = None
        self.video_seq_format = "PNG"
        self.video_seq_ext = ".png"
        self.video_output_path = None
        self._forced_temp_dir = None
        self._preflight_existing_check_done = False
        self._skip_video_encode = False
        try:
            if self.job_mode == "FRAMES":
                img = scene.render.image_settings
                fmt = str(getattr(img, "file_format", "") or "").upper()
                media = str(getattr(img, "media_type", "") or "").upper()
                if fmt in VIDEO_FORMATS or media == "VIDEO":
                    self.video_mode = True
                    self.video_output_path = bpy.path.abspath(scene.render.filepath)
        except Exception:
            pass
        # --- build the job list ---
        if self.job_mode == "FRAMES":
            fstart, fend, fstep = scene.frame_start, scene.frame_end, max(1, scene.frame_step)
            self.frames = list(range(fstart, fend + 1, fstep))
            self.total_frames = len(self.frames)
            self.pending = list(self.frames)
        else:
            # MARI jobs can be expanded later (per-frame for ANIM) before start()
            self.frames = []
            self.total_frames = len(self.mari_jobs)
            self.pending = list(self.mari_jobs)
        self.finished = []
        self.retries = {}
        self.finished_set = set()
        self.inflight = {}
        self.worker_stats = {}
        self.total_render_time = 0.0
        self.total_render_count = 0
        self.rt_last_real_completion_ts = 0.0
        self.temp_dir = None
        self.temp_blend = None
        self.logs_dir = None
        self.cancelled = False
        self._lock = threading.Lock()
        self._server_sock = None
        self._server_thread = None
        self._clients = {}
        # Shared secret the children must echo in their HELLO message.
        self._token = ''.join(random.choice(string.ascii_letters+string.digits) for _ in range(24))
        self._child_script = None
        self._hello_timeout_s = 20.0
        self._launch_events = []
        self._ram_cap_estimate = None
        self._ram_cap_note = ""
        self._enabled_addon_modules_csv = ""
        self._diag_log_path = None
        self._diag_log_fp = None
        self._diag_term_proc = None
        self._diag_term_opened = False
        self._diag_buffer = []
        self._selection_warning = False
        # --- GPU detection and worker planning ---
        self._legacy_detect = _detect_gpu_devices_legacy(False)
        self._strict_detect = _detect_gpu_devices_strict(True)
        sel = _detect_gpu_devices_final_from_lists(
            self.ghost_mode, self._legacy_detect, self._strict_detect
        )
        mapped_all = _map_selection_to_uuids(sel)
        mapped, dropped_unknown = _filter_known_mapped_gpus(mapped_all)
        if dropped_unknown:
            msg = (
                f"[MGPU-GPUSEL] INFO: Hidden {len(dropped_unknown)} unresolved GPU entry(s) "
                f"(index='?'). They will not be launched."
            )
            _log(msg)
            self._diag_write(msg)
            for d in dropped_unknown:
                self._diag_write(
                    f"[MGPU-GPUSEL] dropped idx={d.get('index')} bus={d.get('bus') or '-'} "
                    f"uuid={d.get('uuid') or 'NONE'} name={d.get('name')}"
                )
        self.workers = []
        if mapped:
            for m in mapped:
                tag_label = f"GPU{m['phys_index'] if m['phys_index'] is not None else '??'}"
                for i in range(1, self.instances_per_gpu + 1):
                    self.workers.append(
                        Worker(
                            tag_label, m["uuid"], m["phys_index"], i, self.instances_per_gpu, is_cpu=False,
                            gpu_bus=(m.get("bus") or ""), gpu_name=(m.get("name") or "")
                        )
                    )
        if self.scene.render.engine == "CYCLES":
            if self.cpu_selected:
                self.workers.append(Worker("CPU", None, None, 1, 1, is_cpu=True))
                _log("[MGPU-LAUNCH] Cycles CPU device is enabled; adding one dedicated CPU worker.")
            elif not mapped:
                _log("[MGPU-LAUNCH] No mapped GPU workers and Cycles CPU is disabled; CPU fallback is disabled.")
        else:
            if not mapped:
                self.workers.append(Worker("EEVEE", None, 0, 1, 1, is_cpu=False))
                _log("No explicit GPU list for Eevee - running one worker.")
        if self.scene.render.engine == "CYCLES" and not self.workers:
            raise RuntimeError(
                "No Cycles workers planned. Enable at least one GPU device, or enable CPU in Cycles render devices."
            )
        if self.scene.render.engine == "CYCLES":
            _log(
                f"[MGPU-LAUNCH] Cycles device policy: requested_backend={self.device_mode} "
                f"fallback_backend={self.fallback_device_mode or 'none'} "
                f"cpu_selected={'YES' if self.cpu_selected else 'NO'}"
            )
        _log(f"[MGPU-LAUNCH] Worker plan: mapped_gpus={len(mapped)} instances_per_gpu={self.instances_per_gpu} planned_workers={len(self.workers)}")
        for w in self.workers:
            dev_txt = "CPU" if w.is_cpu else f"GPU idx={w.phys_index if w.phys_index is not None else '?'} uuid={(w.gpu_uuid or 'none')[:24]}"
            self._record_launch_event(w, "PLANNED", "WORKER_PLANNED", dev_txt)
        self._log_gpu_selection_breakdown(sel, mapped)
        self._rebuild_dispatch_queues()
        # --- announce the guard configuration once ---
        if self.rt_guard_enabled:
            _log(
                f"[MGPU-GUARD] Render-time guard active: tier={self.render_guard_tier} "
                f"(soft={self.rt_guard_cfg.get('soft_mult')}x/{int(self.rt_guard_cfg.get('soft_min_s', 0))}s, "
                f"hard={self.rt_guard_cfg.get('hard_mult')}x/{int(self.rt_guard_cfg.get('hard_min_s', 0))}s, "
                f"warmup_worker={int(self.rt_guard_cfg.get('warmup_per_worker_jobs', 0) or 0)}, "
                f"warmup_global={int(self.rt_guard_cfg.get('warmup_completed_jobs', 0) or 0)})"
            )
            if self.rt_periodic_recycle_enabled and self.rt_periodic_recycle_points:
                pts = ",".join(str(int(round(p * 100.0))) for p in self.rt_periodic_recycle_points)
                _log(f"[MGPU-GUARD] Periodic VRAM hygiene restarts enabled at progress points: {pts}%")
        else:
            _log("[MGPU-GUARD] Render-time guard is OFF.")
    def prepare_blend_copy(self):
        """Stage everything the child workers need in a temp directory.

        This saves a copy of the current .blend, writes the child worker
        script, snapshots the enabled add-ons to JSON, creates the logs dir
        (plus a frames dir in video mode) and — for MARI jobs — bundles a
        copy of the MARI add-on so children load the exact same version.
        """
        if self._forced_temp_dir:
            # Reusing a caller-specified directory; wipe it first when overwrite is on.
            self.temp_dir = self._forced_temp_dir
            try:
                if os.path.isdir(self.temp_dir) and self.scene.render.use_overwrite:
                    shutil.rmtree(self.temp_dir, ignore_errors=True)
            except Exception:
                pass
            os.makedirs(self.temp_dir, exist_ok=True)
        else:
            self.temp_dir = tempfile.mkdtemp(prefix="mgpu_frames_")
        self.logs_dir = os.path.join(self.temp_dir, "logs")
        os.makedirs(self.logs_dir, exist_ok=True)
        self._init_diag_log()
        if self.video_mode:
            self.video_seq_dir = os.path.join(self.temp_dir, "frames")
            os.makedirs(self.video_seq_dir, exist_ok=True)
        base = os.path.basename(bpy.data.filepath) or "untitled.blend"
        temp_path = os.path.join(self.temp_dir, base)
        # copy=True keeps the user's session pointed at the original file.
        bpy.ops.wm.save_as_mainfile(filepath=temp_path, copy=True)
        self.temp_blend = temp_path
        self.src_blend_dir = os.path.dirname(bpy.data.filepath)
        self._child_script = _write_child_script(self.temp_dir)
        self._enabled_addons_file = None
        try:
            enabled_addons = _mgpu_enabled_addons_snapshot()
            addons_file = os.path.join(self.temp_dir, "enabled_addons.json")
            with open(addons_file, "w", encoding="utf-8") as fp:
                json.dump({"addons": enabled_addons}, fp)
            self._enabled_addons_file = addons_file
            self._enabled_addon_modules_csv = ",".join(_mgpu_enabled_addon_module_names(enabled_addons))
            _log(f"Captured {len(enabled_addons)} enabled add-ons for workers.")
        except Exception as e:
            _log(f"WARN: Failed to capture enabled add-ons: {e}")
        _log(f"Prepared temp blend: {self.temp_blend}")
        # Bundle the MARI add-on so child workers load the same version
        self._mari_dir = None
        if self.job_mode == "MARI":
            try:
                import importlib, inspect
                mari_mod = None
                # 1) Search installed add-ons; import each real module, then look for addon_prefix == "mari"
                for meta in addon_utils.modules():
                    name = getattr(meta, "__name__", None)
                    if not name:
                        continue
                    try:
                        mod = importlib.import_module(name)
                    except Exception:
                        continue
                    # Primary signal: addon declares addon_prefix = "mari"
                    if getattr(mod, "addon_prefix", None) == "mari":
                        mari_mod = mod
                        break
                    # Fallback heuristic: any registered classes with bl_idname starting with "mari."
                    try:
                        if any(
                            isinstance(obj, type) and getattr(obj, "bl_idname", "").startswith("mari.")
                            for obj in mod.__dict__.values()
                        ):
                            mari_mod = mod
                            break
                    except Exception:
                        pass
                if mari_mod:
                    # Copy either the whole package dir or a single-file add-on.
                    src = os.path.dirname(mari_mod.__file__)
                    dst = os.path.join(self.temp_dir, "holo_mari_addon")
                    if os.path.isdir(src):
                        shutil.copytree(src, dst, ignore=shutil.ignore_patterns("__pycache__", "*.pyc"))
                    else:
                        os.makedirs(dst, exist_ok=True)
                        shutil.copy2(mari_mod.__file__, os.path.join(dst, "__init__.py"))
                    self._mari_dir = dst
                    _log(f"Copied MARI addon to: {self._mari_dir}")
                else:
                    _log("WARN: Could not import/locate the MARI add-on; child will not have bpy.ops.mari.*")
            except Exception as e:
                _log(f"WARN: Failed to copy MARI add-on: {e}")
def _start_server(self):
self._server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
self._server_sock.bind(("127.0.0.1", 0))
self._server_sock.listen(16)
self._server_sock.settimeout(1.0)
self._server_port = self._server_sock.getsockname()[1]
self._server_thread = threading.Thread(target=self._accept_loop, daemon=True)
self._server_thread.start()
_log(f"Scheduler server on port {self._server_port}")
def _accept_loop(self):
while not self.cancelled:
try:
conn, _addr = self._server_sock.accept()
except socket.timeout:
continue
except Exception:
break
threading.Thread(target=self._client_loop, args=(conn,), daemon=True).start()
    def _client_loop(self, conn):
        """Serve one child over *conn* using line-delimited JSON.

        Protocol: the child first sends {"hello": tag, "token": ...}; a bad
        token gets {"exit": True} and the socket is dropped.  Afterwards the
        child loops sending {"get": 1} to request work — answered with
        {"frame": n} or {"mari_job": ..., "globals": ...}, or {"exit": True}
        once the queue is drained — and {"done": ...} to report completion.
        """
        f = conn.makefile("rwb", buffering=0)
        tag = None
        def jrecv():
            # Read one JSON line; EOF means the child went away.
            line = f.readline()
            if not line:
                raise ConnectionError("client closed")
            return json.loads(line.decode("utf-8", "ignore"))
        def jsend(obj):
            f.write((json.dumps(obj)+"\n").encode("utf-8", "ignore")); f.flush()
        try:
            hello = jrecv()
            tag = hello.get("hello"); token = hello.get("token")
            if token != self._token or not tag:
                # Unknown/forged client: tell it to exit and drop the socket.
                try: jsend({"exit": True})
                except Exception: pass
                f.close(); conn.close(); return
            with self._lock:
                self._clients[tag] = (conn, f, jsend, jrecv)
            self._mark_worker_connected(tag)
            while not self.cancelled:
                msg = jrecv()
                if msg.get("get"):
                    unit = None
                    send_exit = False
                    g = None
                    frame = None
                    # Pick the next unit under the lock; send outside it so a
                    # slow socket cannot stall dispatch for other workers.
                    with self._lock:
                        if self.job_mode == "MARI":
                            unit = self._next_mari_for_tag(tag)
                        else:
                            unit = self._next_frame_for_tag(tag)
                        if unit is None:
                            send_exit = True
                        else:
                            self._record_inflight(tag, unit)
                            if self.job_mode == "MARI":
                                g = dict(self.mari_globals)
                                g["proj_total"] = getattr(self, "total_frames", 0)
                                g["proj_done"] = len(self.finished_set)
                            else:
                                frame = int(unit)
                    if send_exit:
                        try: jsend({"exit": True})
                        except Exception: pass
                        break
                    if self.job_mode == "MARI":
                        jsend({"mari_job": unit, "globals": g})
                    else:
                        jsend({"frame": frame})
                    continue
                if "done" in msg:
                    with self._lock:
                        self._handle_job_done(tag, msg)
                    continue
        except Exception as e:
            _log(f"[client] connection end: {e}")
        finally:
            # De-register only if this socket is still the one registered
            # for the tag (a reconnect may have replaced it).
            if tag:
                try:
                    with self._lock:
                        cur = self._clients.get(tag)
                        if cur and cur[0] is conn:
                            self._clients.pop(tag, None)
                except Exception:
                    pass
            try: f.close()
            except Exception: pass
            try: conn.close()
            except Exception: pass
def _worker_by_tag(self, tag):
for w in self.workers:
if w.tag == tag:
return w
return None
def _cycles_backend_for_worker(self, w: Worker):
if not w:
return str(self.device_mode or "CUDA").upper()
if w.is_cpu or self.scene.render.engine != "CYCLES":
return str(self.device_mode or "CUDA").upper()
override = str(getattr(w, "cycles_backend_override", "") or "").upper()
if override:
return override
return str(self.device_mode or "CUDA").upper()
def _cycles_fallback_for_worker(self, w: Worker, primary_backend: str = ""):
if (not w) or w.is_cpu or self.scene.render.engine != "CYCLES":
return ""
primary = str(primary_backend or self._cycles_backend_for_worker(w) or "").upper()
if primary == "OPTIX":
return "CUDA"
return ""
    def _handle_cycles_gpu_policy_failure(self, w: Worker, err_text: str, inflight=None):
        """React to a child's Cycles GPU setup failure described by *err_text*.

        Only fires for Cycles GPU workers whose error mentions
        GPU_POLICY_VIOLATION or a failed Cycles device setup.  OPTIX workers
        are switched to a CUDA override; the worker is then restarted on the
        same GPU (re-queuing *inflight* work).  Returns True when a restart
        was initiated, False otherwise.
        """
        if (not w) or w.is_cpu or self.scene.render.engine != "CYCLES":
            return False
        txt = str(err_text or "")
        txt_up = txt.upper()
        trigger = ("GPU_POLICY_VIOLATION" in txt_up) or ("CYCLES DEVICE SETUP FAILED" in txt_up)
        if not trigger:
            return False
        w.cycles_policy_failures = int(getattr(w, "cycles_policy_failures", 0) or 0) + 1
        current_backend = self._cycles_backend_for_worker(w)
        switched = False
        if current_backend == "OPTIX":
            # Demote OPTIX to CUDA for this worker's next launch.
            w.cycles_backend_override = "CUDA"
            switched = True
        reason = f"cycles-gpu-policy-failure#{w.cycles_policy_failures}"
        if switched:
            reason += f" backend={current_backend}->CUDA"
            _log(f"[MGPU-LAUNCH] {w.tag}: GPU policy violation; switching backend {current_backend} -> CUDA and restarting worker.")
        else:
            reason += f" backend={self._cycles_backend_for_worker(w)}"
            _log(f"[MGPU-LAUNCH] {w.tag}: GPU policy violation persisted on backend={self._cycles_backend_for_worker(w)}; restarting worker.")
        self._diag_write(f"[MGPU-LAUNCH] {w.tag}: err='{txt[:220]}'")
        ok = self._restart_worker_same_gpu(w, reason, info=inflight)
        if ok:
            self._open_diag_terminal_if_needed()
        else:
            _log(f"[MGPU-LAUNCH] WARNING: restart failed after GPU policy violation for {w.tag}.")
        return ok
    def _init_diag_log(self):
        """Open logs_dir/_launch_diagnostics.log (UTF-8 with BOM for
        PowerShell) and replay any diagnostic lines / launch events buffered
        before the log existed.  Failure silently disables the log.
        """
        if not self.logs_dir:
            return
        try:
            path = os.path.join(self.logs_dir, "_launch_diagnostics.log")
            self._diag_log_path = path
            self._diag_log_fp = open(path, "w", encoding="utf-8-sig", newline="")
            self._diag_log_fp.write(BANNER_MANAGER_ASCII.rstrip("\n") + "\n")
            self._diag_log_fp.write("[MGPU-LAUNCH] Diagnostics log initialized.\n")
            # Replay lines buffered by _diag_write before the file existed.
            for line in self._diag_buffer:
                self._diag_log_fp.write((line or "").rstrip("\n") + "\n")
            self._diag_buffer = []
            # Flush queued events captured before logs_dir existed
            for evt in self._launch_events:
                msg = f"[MGPU-LAUNCH] {evt.get('tag','?')} {evt.get('state','')}:{evt.get('reason','')}"
                det = evt.get("detail") or ""
                if det:
                    msg += f" | {det}"
                self._diag_log_fp.write(msg + "\n")
            self._diag_log_fp.flush()
        except Exception:
            self._diag_log_fp = None
def _diag_write(self, text):
try:
line = (text or "").rstrip("\n")
if self._diag_log_fp:
self._diag_log_fp.write(line + "\n")
self._diag_log_fp.flush()
else:
self._diag_buffer.append(line)
if len(self._diag_buffer) > 300:
self._diag_buffer = self._diag_buffer[-300:]
except Exception:
pass
    def _spawn_tail_terminal(self, log_path: str, enable_vt=False):
        """Open a platform-appropriate terminal window tailing *log_path*.

        Windows: a new PowerShell console (black/red, UTF-8; *enable_vt*
        additionally turns on VT escape processing via kernel32 so ANSI
        sequences render).  macOS: Terminal.app via AppleScript.  Other
        POSIX: the first of several known terminal emulators found on PATH.
        Returns the Popen handle, or None when nothing could be launched.
        """
        if not log_path:
            return None
        if IS_WIN:
            # PowerShell single-quote escaping: double any embedded quote.
            path_ps = str(log_path).replace("'", "''")
            vt_block = ""
            if enable_vt:
                # Inline C# shim toggling ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4).
                vt_block = (
                    "$c='using System; using System.Runtime.InteropServices; "
                    "public static class VT{"
                    "[DllImport(\"kernel32.dll\")] public static extern System.IntPtr GetStdHandle(int n); "
                    "[DllImport(\"kernel32.dll\")] public static extern bool GetConsoleMode(System.IntPtr h, out int m); "
                    "[DllImport(\"kernel32.dll\")] public static extern bool SetConsoleMode(System.IntPtr h, int m);"
                    "}'; "
                    "Add-Type -TypeDefinition $c -ErrorAction SilentlyContinue; "
                    "$h=[VT]::GetStdHandle(-11); $m=0; [VT]::GetConsoleMode($h,[ref]$m)|Out-Null; "
                    "[VT]::SetConsoleMode($h, ($m -bor 4)) | Out-Null; "
                )
            cmd = (
                "$ErrorActionPreference='SilentlyContinue'; "
                + vt_block +
                "try{[Console]::OutputEncoding=[Text.UTF8Encoding]::new($true)}catch{}; "
                "chcp 65001 | Out-Null; "
                "try{$raw=$Host.UI.RawUI; $raw.BackgroundColor='Black'; $raw.ForegroundColor='Red'; Clear-Host}catch{}; "
                f"$p='{path_ps}'; Get-Content -LiteralPath $p -Wait"
            )
            try:
                proc = subprocess.Popen(
                    ["powershell", "-NoLogo", "-NoProfile", "-Command", cmd],
                    creationflags=subprocess.CREATE_NEW_CONSOLE
                )
                # Tie the console to our Job Object so it dies with Blender.
                _win_job_assign(proc)
                return proc
            except Exception:
                return None
        if IS_MAC:
            path_applescript = str(log_path).replace("\\", "\\\\").replace('"', '\\"')
            cmd_prefix = "printf '\\\\033[0;31;40m'; clear; tail -f "
            script = (
                'tell application "Terminal"\n'
                f'    do script "{cmd_prefix}" & quoted form of POSIX path of "{path_applescript}"\n'
                'end tell'
            )
            try:
                return subprocess.Popen(["osascript", "-e", script])
            except Exception:
                return None
        # Linux / BSD / other POSIX
        quoted = shlex.quote(str(log_path))
        tail_cmd = f"printf '\\033[0;31;40m'; clear; tail -f {quoted}"
        candidates = []
        if shutil.which("xterm"):
            candidates.append(["xterm", "-hold", "-bg", "black", "-fg", "red", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("x-terminal-emulator"):
            candidates.append(["x-terminal-emulator", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("gnome-terminal"):
            candidates.append(["gnome-terminal", "--", "sh", "-lc", tail_cmd])
        if shutil.which("konsole"):
            candidates.append(["konsole", "--hold", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("xfce4-terminal"):
            candidates.append(["xfce4-terminal", "--hold", "--command", f"sh -lc {shlex.quote(tail_cmd)}"])
        if shutil.which("mate-terminal"):
            candidates.append(["mate-terminal", "--", "sh", "-lc", tail_cmd])
        if shutil.which("lxterminal"):
            candidates.append(["lxterminal", "-e", f"sh -lc {shlex.quote(tail_cmd)}"])
        if shutil.which("kitty"):
            candidates.append(["kitty", "sh", "-lc", tail_cmd])
        if shutil.which("alacritty"):
            candidates.append(["alacritty", "-e", "sh", "-lc", tail_cmd])
        # First emulator that launches wins.
        for argv in candidates:
            try:
                return subprocess.Popen(argv)
            except Exception:
                continue
        return None
def _open_diag_terminal_if_needed(self):
if self._diag_term_opened:
return
if (not self.open_terms) or (not self._diag_log_path):
return
self._diag_term_opened = True
self._diag_term_proc = self._spawn_tail_terminal(self._diag_log_path, enable_vt=False)
if self._diag_term_proc is None:
self._diag_term_opened = False
def _log_gpu_selection_breakdown(self, final_sel, mapped):
mode = self.ghost_mode
snap = _cycles_device_snapshot()
backend = snap.get("backend")
rows = list(snap.get("rows") or [])
if rows:
type_counts = Counter(str(r.get("type") or "?") for r in rows)
cnt_text = ", ".join(f"{k}:{type_counts[k]}" for k in sorted(type_counts.keys()))
sum_msg = f"[MGPU-GPUSEL] cycles backend={backend} rows={len(rows)} by_type=[{cnt_text}]"
_log(sum_msg)
self._diag_write(sum_msg)
for i, r in enumerate(rows):
self._diag_write(
f"[MGPU-GPUSEL] cycles[{i}] type={r.get('type')} use={r.get('use')} "
f"bus={r.get('bus') or '-'} id={r.get('id') or '-'} name={r.get('name')}"
)
_log(
f"[MGPU-GPUSEL] mode={mode} legacy={len(self._legacy_detect)} strict={len(self._strict_detect)} "
f"final={len(final_sel)} mapped={len(mapped)}"
)
for i, item in enumerate(final_sel):
idx, name, backend, selected, bus = item
msg = f"[MGPU-GPUSEL] final[{i}] idx={idx} sel={selected} backend={backend} bus={bus or '-'} name={name}"
_log(msg)
self._diag_write(msg)
for i, m in enumerate(mapped):
msg = (
f"[MGPU-GPUSEL] mapped[{i}] phys_index={m.get('phys_index')} idx={m.get('index')} "
f"bus={m.get('bus') or '-'} uuid={(m.get('uuid') or 'NONE')} "
f"name={m.get('name') or '-'}"
f"{(' cycles_name=' + str(m.get('cycles_name'))) if m.get('cycles_name') else ''}"
)
_log(msg)
self._diag_write(msg)
if len(final_sel) != len(mapped):
msg = f"[MGPU-GPUSEL] WARNING: final({len(final_sel)}) != mapped({len(mapped)})"
_log(msg)
self._diag_write(msg)
self._selection_warning = True
if len(final_sel) < len(self._strict_detect):
msg = (
f"[MGPU-GPUSEL] WARNING: strict selected GPUs ({len(self._strict_detect)}) reduced to final ({len(final_sel)}) "
f"by ghost filter mode '{mode}'. STRICT safeguard should prevent launch drops."
)
_log(msg)
self._diag_write(msg)
self._selection_warning = True
missing_uuid = sum(1 for m in mapped if not m.get("uuid"))
if missing_uuid > 0:
msg = f"[MGPU-GPUSEL] WARNING: {missing_uuid} mapped GPU(s) have no UUID match; launch pinning may be unreliable."
_log(msg)
self._diag_write(msg)
self._selection_warning = True
phys = _win_query_nvidia_smi_detailed() or []
if phys:
mapped_phys = {m.get("phys_index") for m in mapped if m.get("phys_index") is not None}
backend_rows = [r for r in rows if str(r.get("type")) == str(backend)]
if backend and len(backend_rows) < len(phys):
msg = (
f"[MGPU-GPUSEL] WARNING: Cycles backend '{backend}' exposes {len(backend_rows)} "
f"device row(s), but nvidia-smi sees {len(phys)} GPU(s)."
)
_log(msg)
self._diag_write(msg)
self._selection_warning = True
if len(mapped_phys) < len(phys):
phys_idx = {g.get("index") for g in phys if g.get("index") is not None}
missing_idx = sorted([i for i in phys_idx if i not in mapped_phys])
msg = (
f"[MGPU-GPUSEL] WARNING: NVIDIA physical GPUs={len(phys)} but mapped GPUs={len(mapped_phys)}. "
f"A GPU may be filtered out by backend/type mismatch or unresolved bus-id mapping."
)
_log(msg)
self._diag_write(msg)
if missing_idx:
miss_msg = f"[MGPU-GPUSEL] WARNING: unmapped NVIDIA index(es): {','.join(str(i) for i in missing_idx)}"
_log(miss_msg)
self._diag_write(miss_msg)
self._selection_warning = True
for g in phys:
msg = (
f"[MGPU-GPUSEL] phys idx={g.get('index')} bus={g.get('bus')} "
f"uuid={g.get('uuid')} name={g.get('name')}"
)
self._diag_write(msg)
    def _record_launch_event(self, w: Worker, state: str, reason: str, detail: str = ""):
        """Record a launch-state transition for worker *w* and broadcast it.

        Updates the worker's launch_state/launch_reason/launch_detail fields,
        appends an event dict to self._launch_events, then writes the message
        to the console log, the diagnostics file, and (best effort) the
        worker's own output stream. Failure/stall states additionally open
        the diagnostics terminal.
        """
        w.launch_state = str(state or "")
        w.launch_reason = str(reason or "")
        w.launch_detail = str(detail or "")
        evt = {"tag": w.tag, "state": w.launch_state, "reason": w.launch_reason, "detail": w.launch_detail, "t": time.time()}
        self._launch_events.append(evt)
        msg = f"[MGPU-LAUNCH] {w.tag} {w.launch_state}: {w.launch_reason}"
        if w.launch_detail:
            msg += f" | {w.launch_detail}"
        _log(msg)
        self._diag_write(msg)
        try:
            # Mirror the event into the worker's own log; ignore I/O errors.
            self._emit(w, msg + "\n")
        except Exception:
            pass
        if w.launch_state in {"FAILED_TO_LAUNCH", "FAILED_RUNTIME", "STALLING"}:
            self._open_diag_terminal_if_needed()
    def _update_ram_capacity_note(self):
        """Estimate how many worker processes fit in available system RAM.

        Uses this process's RSS as a proxy for one child's footprint (80% of
        RSS, floored at 512 MiB) and divides available memory by it. Stores
        the estimate in self._ram_cap_estimate; when the planned worker count
        exceeds it, records a human-readable warning in self._ram_cap_note.
        """
        rss = _proc_rss_bytes()
        avail = _sys_mem_available_bytes()
        if rss is None or avail is None:
            # Platform probes unavailable; skip the estimate entirely.
            return
        per_child = max(int(rss * 0.8), 512 * 1024 * 1024)
        if per_child <= 0:
            return
        cap = max(1, int(avail // per_child))
        self._ram_cap_estimate = cap
        planned = len(self.workers)
        if planned > cap:
            self._ram_cap_note = (
                f"Planned workers={planned} exceeds rough RAM capacity={cap} "
                f"(RSS={_fmt_bytes(rss)}, free={_fmt_bytes(avail)}, per-worker~{_fmt_bytes(per_child)})."
            )
            _log(f"[MGPU-LAUNCH] WARN: {self._ram_cap_note}")
def _mark_worker_connected(self, tag):
w = self._worker_by_tag(tag)
if not w:
return
if not w.hello_received:
w.hello_received = True
w.hello_ts = time.time()
self._record_launch_event(w, "CONNECTED", "WORKER_HELLO_OK", f"pid={w.launch_pid if w.launch_pid is not None else '?'}")
    def _check_launch_health(self):
        """Detect workers that launched but never handshook, or died early.

        For each launch-attempted worker with a live process handle:
        - still running but no hello within self._hello_timeout_s:
          report a STALLING event (once per worker);
        - exited before its hello: classify the exit from the last log line
          and returncode, then report FAILED_RUNTIME (once per worker).
        """
        now = time.time()
        for w in self.workers:
            if not w.launch_attempted:
                continue
            p = w.proc
            if not p:
                continue
            try:
                rc = p.poll()  # None while the child is still running
            except Exception:
                rc = None
            # Handshake overdue while the process is still alive.
            if (rc is None) and (not w.hello_received) and w.launch_ts and (now - w.launch_ts > self._hello_timeout_s) and (not w.hello_timeout_reported):
                w.hello_timeout_reported = True
                self._record_launch_event(w, "STALLING", "NO_HANDSHAKE_TIMEOUT", f"waited>{int(self._hello_timeout_s)}s")
            # Process exited before completing the hello handshake.
            if (rc is not None) and (not w.hello_received) and (not w.exit_before_hello_reported):
                w.exit_before_hello_reported = True
                reason = _classify_runtime_exit_reason(getattr(w, "last_line", ""), rc)
                self._record_launch_event(w, "FAILED_RUNTIME", reason, f"returncode={rc} last='{(w.last_line or '').strip()[:180]}'")
def _rebuild_dispatch_queues(self):
if self.dispatch_mode != "STRIDE":
for w in self.workers:
w.local_frames = []
return
worker_count = max(1, len(self.workers))
units = list(self.pending)
for idx, w in enumerate(self.workers):
w.local_frames = units[idx::worker_count]
    def _output_spec_for_unit(self, unit):
        """Describe the expected output file for one work unit.

        Returns a dict with keys "path" (absolute output file), "kind"
        ("image" or "video") and "expected_size" (pixel size to validate
        against, or None for videos), or None when a MARI unit is malformed
        or the MARI output root/name/extension cannot be resolved.
        """
        expected_size = _mgpu_scene_expected_image_size(self.scene)
        if self.job_mode == "MARI":
            if not isinstance(unit, dict):
                return None
            root, name = _mgpu_mari_output_root(self.scene)
            ext = (_mgpu_mari_ext_from_scene(self.scene) or "").lower().lstrip(".")
            if not (root and name and ext):
                return None
            action = str(self.mari_globals.get("action", "STILL") or "STILL").upper()
            is_video = bool(self.mari_globals.get("is_video", False))
            h_txt = _mgpu_format_hv_label(unit.get("H"))
            v_txt = _mgpu_format_hv_label(unit.get("V"))
            stem = f"{name}_H{h_txt}_V{v_txt}"
            try:
                frame = int(unit.get("frame", -1))
            except Exception:
                frame = -1
            # Animation as image sequence: per-camera subfolder, frame-numbered file.
            if action == "ANIM" and (not is_video) and frame >= 0:
                return {
                    "path": os.path.join(root, stem, f"{name}_{frame:04d}.{ext}"),
                    "kind": "image",
                    "expected_size": expected_size,
                }
            # Animation as a single video file spanning the scene frame range.
            if action == "ANIM" and is_video:
                start_f = int(getattr(self.scene, "frame_start", 0))
                end_f = int(getattr(self.scene, "frame_end", 0))
                video_stem = f"{name}_{start_f:04d}-{end_f:04d}_H{h_txt}_V{v_txt}"
                return {
                    "path": os.path.join(root, f"{video_stem}.{ext}"),
                    "kind": "video",
                    "expected_size": None,
                }
            # Still image: one file per H/V camera position.
            return {
                "path": os.path.join(root, f"{stem}.{ext}"),
                "kind": "image",
                "expected_size": expected_size,
            }
        # FRAMES mode: the unit is (usually) an int frame number.
        try:
            frame = int(unit)
        except Exception:
            frame = unit
        if self.video_mode and self.video_seq_dir:
            # Video jobs render to a temp image sequence that is encoded later.
            return {
                "path": _mgpu_video_seq_frame_path(self.video_seq_dir, frame, self.video_seq_ext),
                "kind": "image",
                "expected_size": expected_size,
            }
        return {
            "path": _mgpu_scene_frame_output_path(self.scene, frame),
            "kind": "image",
            "expected_size": expected_size,
        }
    def _preflight_existing_outputs(self):
        """Skip work whose outputs already exist (only when overwrite is OFF).

        When the scene's "overwrite" flag is disabled, scans each pending
        unit's expected output file; valid existing files are marked finished
        up front so workers never re-render them. For FRAMES+video jobs an
        existing non-empty final video short-circuits everything (including
        the encode step). Rebuilds per-worker dispatch queues afterwards.
        """
        self._preflight_existing_check_done = False
        self._skip_video_encode = False
        self._rebuild_dispatch_queues()
        if getattr(self.scene.render, "use_overwrite", True):
            # Overwrite is ON: everything gets re-rendered, nothing to reuse.
            return
        pending_before = list(self.pending)
        total_checked = len(pending_before)
        if total_checked <= 0:
            return
        # Fast path: final video already exists -> reuse it wholesale.
        if self.job_mode == "FRAMES" and self.video_mode and _mgpu_existing_file_nonempty(self.video_output_path):
            reused = 0
            for unit in pending_before:
                if self._mark_finished(self._job_key(unit, "FRAMES")):
                    reused += 1
            self.pending = []
            self._skip_video_encode = True
            self._preflight_existing_check_done = True
            self._rebuild_dispatch_queues()
            msg = (
                f"[MGPU-PREFLIGHT] overwrite=OFF reused existing final video; "
                f"checked={total_checked} reusable={reused} pending=0 final={self.video_output_path}"
            )
            _log(msg)
            self._diag_write(msg)
            return
        # Per-unit scan; directory listings and image validations are cached.
        dir_cache = {}
        image_cache = {}
        pending_after = []
        reused = 0
        missing = 0
        invalid = 0
        for unit in pending_before:
            spec = self._output_spec_for_unit(unit)
            path = os.path.normpath(str((spec or {}).get("path", "") or ""))
            if not path:
                pending_after.append(unit)
                missing += 1
                continue
            dir_path = os.path.dirname(path)
            base = os.path.basename(path).lower()
            if dir_path not in dir_cache:
                dir_cache[dir_path] = _mgpu_scan_dir_files(dir_path)
            entry = dir_cache[dir_path].get(base)
            if not entry:
                pending_after.append(unit)
                missing += 1
                continue
            if int(entry.get("size", -1) or -1) <= 0:
                # Zero-byte file: treat as corrupt and re-render.
                pending_after.append(unit)
                invalid += 1
                continue
            kind = str(spec.get("kind", "image") or "image").lower()
            if kind == "image":
                ok = _mgpu_validate_existing_image(
                    path,
                    expected_size=spec.get("expected_size"),
                    cache=image_cache,
                )
            else:
                # Videos are accepted on non-empty existence alone.
                ok = True
            if ok:
                if self._mark_finished(self._job_key(unit)):
                    reused += 1
            else:
                pending_after.append(unit)
                invalid += 1
        self.pending = pending_after
        self._preflight_existing_check_done = True
        if self.job_mode == "FRAMES" and self.video_mode and (not self.pending) and _mgpu_existing_file_nonempty(self.video_output_path):
            self._skip_video_encode = True
        self._rebuild_dispatch_queues()
        msg = (
            f"[MGPU-PREFLIGHT] overwrite=OFF checked={total_checked} reusable={reused} "
            f"pending={len(self.pending)} missing={missing} invalid={invalid} dirs={len(dir_cache)}"
        )
        _log(msg)
        self._diag_write(msg)
        if self.job_mode == "FRAMES" and self.video_mode and (not self.pending) and (not self._skip_video_encode):
            enc_msg = (
                f"[MGPU-PREFLIGHT] All temp frames are already valid; final video will be encoded -> "
                f"{self.video_output_path}"
            )
            _log(enc_msg)
            self._diag_write(enc_msg)
def _next_frame_for_tag(self, tag):
if self.dispatch_mode == "STRIDE":
for w in self.workers:
if w.tag == tag and getattr(w, "local_frames", None) is not None:
while w.local_frames:
frame = w.local_frames.pop(0)
if self._job_key(frame, "FRAMES") in self.finished_set:
continue
return frame
return None
while self.pending:
frame = self.pending.pop(0)
if self._job_key(frame, "FRAMES") in self.finished_set:
continue
return frame
return None
def _requeue_frame_for_tag(self, tag, frame, prefer_other=False):
if self.dispatch_mode == "STRIDE":
target = None
if prefer_other:
others = [w for w in self.workers if w.tag != tag]
if others:
target = min(others, key=lambda w: len(getattr(w, "local_frames", []) or []))
if target is None:
for w in self.workers:
if w.tag == tag:
target = w
break
if target:
if getattr(target, "local_frames", None) is None:
target.local_frames = []
target.local_frames.append(frame)
return
else:
self.pending.insert(0, frame)
def _next_mari_for_tag(self, tag):
if self.dispatch_mode == "STRIDE":
for w in self.workers:
if w.tag == tag and getattr(w, "local_frames", None) is not None:
# For MARI stride we reuse local_frames to store jobs
while w.local_frames:
job = w.local_frames.pop(0)
if self._job_key(job, "MARI") in self.finished_set:
continue
return job
return None
while self.pending:
job = self.pending.pop(0)
if self._job_key(job, "MARI") in self.finished_set:
continue
return job
return None
def _requeue_mari_for_tag(self, tag, job, prefer_other=False):
if self.dispatch_mode == "STRIDE":
target = None
if prefer_other:
others = [w for w in self.workers if w.tag != tag]
if others:
target = min(others, key=lambda w: len(getattr(w, "local_frames", []) or []))
if target is None:
for w in self.workers:
if w.tag == tag:
target = w
break
if target:
if getattr(target, "local_frames", None) is None:
target.local_frames = []
target.local_frames.append(job)
return
else:
self.pending.insert(0, job)
def _job_key(self, job, mode=None):
mode = mode or self.job_mode
if mode == "MARI":
if isinstance(job, dict):
cam = job.get("cam_name") or job.get("camera") or job.get("name") or "?"
h = job.get("H")
v = job.get("V")
frame = job.get("frame", None)
try:
frame_val = int(frame) if frame is not None else -1
except Exception:
frame_val = frame if frame is not None else -1
return f"mari:{cam}:{h}:{v}:{frame_val}"
return f"mari:{job}"
try:
n = int(job)
except Exception:
n = job
return f"frame:{n}"
def _job_label(self, job, mode=None):
mode = mode or self.job_mode
if mode == "MARI":
if isinstance(job, dict):
cam = job.get("cam_name") or "?"
frame = job.get("frame", None)
try:
frame_val = int(frame) if frame is not None else -1
except Exception:
frame_val = frame if frame is not None else -1
if frame_val is None or frame_val == -1:
return cam
return f"{cam} f{frame_val}"
return str(job)
return f"frame {job}"
    def _record_inflight(self, tag, job):
        """Register *job* as in-flight on worker *tag*; returns the job key.

        Resets the worker's CPU-hint fields and progress-guard timestamp so
        the render-time guard measures this assignment from scratch, then
        stores a fresh bookkeeping record in self.inflight[tag].
        """
        key = self._job_key(job)
        now = time.time()
        w = self._worker_by_tag(tag)
        progress_ts = now
        if w:
            # A new assignment invalidates any stale CPU-device hint.
            w.cycles_cpu_hint_ts = 0.0
            w.cycles_cpu_hint_line = ""
            w.guard_last_progress_ts = now
            w.guard_last_progress_sig = ("ASSIGN", str(key))
        self.inflight[tag] = {
            "job": job,
            "key": key,
            "start": now,
            "stolen": False,
            "guard_hedged": False,
            "guard_hedge_ts": 0.0,
            "guard_last_progress_ts": progress_ts,
            "guard_progress_seen": False,
            # Epoch lets the guard ignore records from before a restart.
            "guard_epoch": int(getattr(w, "guard_epoch", 0) or 0),
        }
        return key
def _mark_finished(self, job_key):
if job_key in self.finished_set:
return False
self.finished_set.add(job_key)
self.finished.append(job_key)
self.rt_guard_job_state.pop(job_key, None)
return True
def _update_worker_avg(self, tag, elapsed):
try:
elapsed = float(elapsed)
except Exception:
return
if elapsed <= 0.0:
return
st = self.worker_stats.get(tag)
if not st:
st = {"count": 0, "avg": 0.0}
count = st["count"] + 1
avg = (st["avg"] * st["count"] + elapsed) / count
st["count"] = count
st["avg"] = avg
self.worker_stats[tag] = st
self.total_render_time += elapsed
self.total_render_count += 1
self.rt_last_real_completion_ts = time.time()
try:
self._worker_hist.setdefault(tag, deque(maxlen=12)).append(float(elapsed))
self._global_hist.append(float(elapsed))
except Exception:
pass
def _reset_worker_timing_baseline(self, tag):
self.worker_stats.pop(tag, None)
try:
self._worker_hist[tag] = deque(maxlen=12)
except Exception:
self._worker_hist[tag] = deque(maxlen=12)
def _avg_for_tag(self, tag):
st = self.worker_stats.get(tag)
if st and st.get("count", 0) > 0:
return st.get("avg", 0.0)
if self.total_render_count > 0:
return self.total_render_time / float(self.total_render_count)
return None
def _rt_guard_log(self, key: str, msg: str, every_s: float = 20.0, force: bool = False):
now = time.time()
if not force:
last = self._rt_guard_last_log.get(key, 0.0)
if (now - last) < max(0.0, float(every_s)):
return
self._rt_guard_last_log[key] = now
_log(msg)
self._diag_write(msg)
def _alive_worker_count(self):
n = 0
for w in self.workers:
try:
if w.alive():
n += 1
except Exception:
continue
return n
def _has_other_alive_worker(self, tag):
for w in self.workers:
if w.tag == tag:
continue
try:
if w.alive():
return True
except Exception:
continue
return False
def _rt_pending_count(self):
count = 0
if self.dispatch_mode == "STRIDE":
for w in self.workers:
for unit in list(getattr(w, "local_frames", []) or []):
try:
if self._job_key(unit) in self.finished_set:
continue
except Exception:
pass
count += 1
return count
for unit in list(self.pending):
try:
if self._job_key(unit) in self.finished_set:
continue
except Exception:
pass
count += 1
return count
    def _rt_tail_straggler_ready(self, tag):
        """True when *tag* holds the very last job and another worker could take it.

        A tail-straggler override is allowed only when this worker's job is
        the single remaining inflight unit, no queued work exists, and at
        least one other worker has either completed a job before or is alive
        and handshaken while idle.
        """
        # Tail-straggler means this worker is the only remaining inflight job
        # and there is no queued work left for anyone else.
        if tag not in self.inflight:
            return False
        if len(self.inflight) != 1:
            return False
        if self._rt_pending_count() > 0:
            return False
        others = [w for w in self.workers if w.tag != tag]
        if not others:
            return False
        for ow in others:
            try:
                # Another worker with any completed job is a viable taker.
                st = self.worker_stats.get(ow.tag) or {}
                if int(st.get("count", 0) or 0) > 0:
                    return True
            except Exception:
                pass
            try:
                # Or an idle, alive, handshaken worker.
                if (ow.tag not in self.inflight) and ow.alive() and bool(getattr(ow, "hello_received", False)):
                    return True
            except Exception:
                pass
        return False
def _rt_warmup_state(self, tag):
need_global = int(self.rt_guard_cfg.get("warmup_completed_jobs", 0) or 0)
need_worker = int(self.rt_guard_cfg.get("warmup_per_worker_jobs", 0) or 0)
done_global = int(self.total_render_count or 0)
done_worker = int((self.worker_stats.get(tag) or {}).get("count", 0) or 0)
block_global = (need_global > 0 and done_global < need_global)
block_worker = (need_worker > 0 and done_worker < need_worker)
block = bool(block_global or block_worker)
return {
"block": block,
"done_global": done_global,
"done_worker": done_worker,
"need_global": need_global,
"need_worker": need_worker,
}
    def _rt_activate_periodic_recycle_stages(self):
        """Arm VRAM-hygiene recycle stages once render progress crosses them.

        Each point in self.rt_periodic_recycle_points is a fraction of total
        work; when progress passes a point (and the minimum completed-job
        gate is met), every current GPU worker tag is queued for a one-time
        restart under self.rt_periodic_recycle_pending. Each stage fires once.
        """
        if (not self.rt_guard_enabled) or (not self.rt_periodic_recycle_enabled):
            return
        if not self.rt_periodic_recycle_points:
            return
        total = int(getattr(self, "total_frames", 0) or 0)
        if total <= 0:
            return
        rendered_done = int(self.total_render_count or 0)
        min_done = int(self.rt_guard_cfg.get("periodic_recycle_min_completed_jobs", 0) or 0)
        if rendered_done < max(0, min_done):
            return
        progress = float(rendered_done) / float(max(1, total))
        # Only GPU workers are recycled; CPU workers are exempt.
        gpu_tags = [w.tag for w in self.workers if not getattr(w, "is_cpu", False)]
        if not gpu_tags:
            return
        for p in self.rt_periodic_recycle_points:
            mark = int(round(float(p) * 100.0))  # stage id as a percentage
            if mark in self.rt_periodic_recycle_seen:
                continue
            if progress < float(p):
                continue
            self.rt_periodic_recycle_seen.add(mark)
            self.rt_periodic_recycle_pending[mark] = set(gpu_tags)
            self._rt_guard_log(
                f"rt-periodic-activate-{mark}",
                f"[MGPU-GUARD] Activated VRAM hygiene recycle stage {mark}% "
                f"(rendered={rendered_done}/{total}, completed={len(self.finished_set)}/{total}).",
                force=True
            )
    def _rt_try_periodic_recycle_after_job(self, tag):
        """After *tag* finishes a job, run its pending VRAM-hygiene restart if any.

        Processes at most one pending recycle stage for this worker per call:
        CPU/missing workers are removed from the stage, restart-rate limits
        defer the stage, and a successful restart clears the tag (and the
        stage, once all GPU workers have been recycled).
        """
        if (not self.rt_guard_enabled) or (not self.rt_periodic_recycle_enabled):
            return
        self._rt_activate_periodic_recycle_stages()
        if not self.rt_periodic_recycle_pending:
            return
        try:
            stages = sorted(self.rt_periodic_recycle_pending.keys())
        except Exception:
            stages = list(self.rt_periodic_recycle_pending.keys())
        for stage in stages:
            pending = self.rt_periodic_recycle_pending.get(stage)
            if not pending:
                # Empty stage: clean it up and move on.
                self.rt_periodic_recycle_pending.pop(stage, None)
                continue
            if tag not in pending:
                continue
            w = self._worker_by_tag(tag)
            if (not w) or w.is_cpu:
                # Worker vanished or is CPU: drop it from the stage.
                pending.discard(tag)
                if not pending:
                    self.rt_periodic_recycle_pending.pop(stage, None)
                continue
            now = time.time()
            can_restart, why = self._rt_can_restart_worker(w, now)
            if not can_restart:
                # Rate-limited: leave the stage pending and retry later.
                self._rt_guard_log(
                    f"rt-periodic-skip-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM recycle {stage}% delayed ({why}).",
                    every_s=20.0
                )
                return
            reason = f"periodic-vram-hygiene-{stage}%"
            ok = self._restart_worker_same_gpu(w, reason, info=None)
            if ok:
                pending.discard(tag)
                self._rt_guard_log(
                    f"rt-periodic-restarted-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM hygiene restart at {stage}% complete.",
                    force=True
                )
                self._open_diag_terminal_if_needed()
                if not pending:
                    self.rt_periodic_recycle_pending.pop(stage, None)
                    self._rt_guard_log(
                        f"rt-periodic-stage-done-{stage}",
                        f"[MGPU-GUARD] Periodic VRAM hygiene stage {stage}% completed for all GPU workers.",
                        force=True
                    )
            else:
                self._rt_guard_log(
                    f"rt-periodic-fail-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM hygiene restart at {stage}% failed.",
                    force=True
                )
                self._open_diag_terminal_if_needed()
            # One stage handled per completed job.
            return
def _rt_baseline_for(self, tag):
min_base = float(self.rt_guard_cfg.get("min_baseline_s", 20.0) or 20.0)
worker_med = _median(self._worker_hist.get(tag, []))
global_med = _median(self._global_hist)
tag_avg = self._avg_for_tag(tag)
worker_count = int((self.worker_stats.get(tag) or {}).get("count", 0) or 0)
cands = []
pref = (worker_med, tag_avg) if worker_count > 0 else ()
fallback = (global_med,) if worker_count <= 0 else (global_med,)
for v in tuple(pref) + tuple(fallback):
try:
fv = float(v)
if fv > 0:
cands.append(fv)
except Exception:
pass
if not cands:
return min_base
return max(max(cands), min_base)
def _rt_stall_restart_threshold(self, baseline, soft_th, progress_stall_s):
try:
baseline = float(baseline)
except Exception:
baseline = 0.0
try:
soft_th = float(soft_th)
except Exception:
soft_th = 0.0
try:
progress_stall_s = float(progress_stall_s)
except Exception:
progress_stall_s = 0.0
return max(progress_stall_s * 2.0, min(soft_th if soft_th > 0.0 else progress_stall_s, baseline * 2.5 if baseline > 0.0 else progress_stall_s))
def _rt_clean_worker_restart_marks(self, w: Worker):
window_frames = int(self.rt_guard_cfg.get("worker_restart_window_frames", 20) or 20)
kept = []
for marker in list(getattr(w, "guard_restart_marks", []) or []):
try:
if (int(self.total_render_count) - int(marker)) < window_frames:
kept.append(int(marker))
except Exception:
pass
w.guard_restart_marks = kept
    def _rt_can_restart_worker(self, w: Worker, now: float):
        """Rate-limit check before restarting *w*; returns (allowed, reason).

        Gates, in order: the worker must be alive; the global pause window
        must be over; the sliding global restart window must have headroom
        (opening the circuit and pausing otherwise); the worker's personal
        cooldown must have elapsed; and its per-window restart budget must
        not be exhausted.
        """
        if (not w) or (not w.alive()):
            return (False, "worker-not-alive")
        if now < float(getattr(self, "rt_guard_pause_until", 0.0) or 0.0):
            return (False, "global-pause")
        global_window = float(self.rt_guard_cfg.get("global_restart_window_s", 150.0) or 150.0)
        global_limit = int(self.rt_guard_cfg.get("global_restart_limit", 3) or 3)
        # Evict restart timestamps that fell out of the sliding window.
        while self.rt_guard_restart_ts and ((now - self.rt_guard_restart_ts[0]) > global_window):
            self.rt_guard_restart_ts.popleft()
        if global_limit > 0 and len(self.rt_guard_restart_ts) >= global_limit:
            # Too many restarts recently: open the circuit for one window.
            self.rt_guard_pause_until = now + global_window
            self._rt_guard_log(
                "rt-global-circuit",
                f"[MGPU-GUARD] Global restart circuit open for {int(global_window)}s (limit={global_limit}).",
                force=True
            )
            self._open_diag_terminal_if_needed()
            return (False, "global-circuit")
        cooldown = float(self.rt_guard_cfg.get("worker_restart_cooldown_s", 120.0) or 120.0)
        if w.guard_restart_ts and ((now - w.guard_restart_ts[-1]) < cooldown):
            return (False, "worker-cooldown")
        self._rt_clean_worker_restart_marks(w)
        budget = int(self.rt_guard_cfg.get("worker_restart_budget", 2) or 2)
        if budget > 0 and len(w.guard_restart_marks) >= budget:
            return (False, "worker-budget")
        return (True, "ok")
def _duplicate_job_for_hedge(self, tag, job):
if job is None:
return
if self.job_mode == "MARI":
self._requeue_mari_for_tag(tag, job, prefer_other=True)
else:
try:
frame = int(job)
except Exception:
frame = job
self._requeue_frame_for_tag(tag, frame, prefer_other=True)
    def _restart_worker_same_gpu(self, w: Worker, reason: str, info=None):
        """Kill and relaunch *w* on its same GPU, requeueing its current job.

        Steps: pop the inflight record (or use the caller-supplied *info*),
        requeue the unfinished job to another worker; terminate the process
        gracefully (CTRL_BREAK on Windows, SIGTERM elsewhere) with a 2.5s
        grace period, then kill; reset handshake/guard state and the timing
        baseline; record the RESTARTING event and relaunch. Returns the
        relaunch result (truthy on success).
        """
        now = time.time()
        if info is None:
            info = self.inflight.get(w.tag)
        self.inflight.pop(w.tag, None)
        if info:
            key = info.get("key")
            job = info.get("job")
            # Only requeue work that has not finished elsewhere meanwhile.
            if job is not None and (not key or key not in self.finished_set):
                self._duplicate_job_for_hedge(w.tag, job)
                self._rt_guard_log(
                    f"rt-requeue-{w.tag}",
                    f"[MGPU-GUARD] {w.tag}: requeued {self._job_label(job)} before restart.",
                    force=True
                )
        p = getattr(w, "proc", None)
        if p and (p.poll() is None):
            try:
                if IS_WIN:
                    # Windows console processes need CTRL_BREAK, not terminate().
                    p.send_signal(signal.CTRL_BREAK_EVENT)
                else:
                    p.terminate()
            except Exception:
                pass
            # Grace period before escalating to a hard kill.
            deadline = time.time() + 2.5
            while (p.poll() is None) and (time.time() < deadline):
                time.sleep(0.05)
            if p.poll() is None:
                try:
                    p.kill()
                except Exception:
                    pass
        # Reset launch/handshake bookkeeping for the fresh process.
        w.proc = None
        w.stdout_thread = None
        w.launch_ok = False
        w.launch_pid = None
        w.hello_received = False
        w.hello_ts = 0.0
        w.hello_timeout_reported = False
        w.exit_before_hello_reported = False
        # Bump the epoch so stale inflight records are ignored by the guard.
        w.guard_epoch = int(getattr(w, "guard_epoch", 0) or 0) + 1
        w.guard_last_progress_ts = now
        w.guard_last_progress_sig = ("RESTART", int(now))
        self._reset_worker_timing_baseline(w.tag)
        self._record_launch_event(w, "RESTARTING", "RENDERTIME_GUARD", reason)
        ok = self._launch_worker_process(w)
        if ok:
            # Track restart rate for cooldowns / budgets / global circuit.
            w.guard_restart_ts.append(now)
            w.guard_restart_marks.append(int(self.total_render_count))
            w.guard_restarts_total += 1
            self.rt_guard_restart_ts.append(now)
        return ok
    def _check_render_time_guard(self):
        """Watchdog pass over inflight jobs: hedge slow ones, restart stalled workers.

        Per tick it (1) snapshots each inflight job's elapsed time, baseline
        and thresholds; (2) detects a "global no-progress wave" when every
        active worker is simultaneously stalled; (3) per worker, applies
        warmup gates, CPU-device-hint backend switching, hedging (duplicate
        the job onto another worker at the soft threshold) and, ultimately,
        a same-GPU restart at the hard/stall thresholds — all bounded by
        per-job and per-worker restart caps and the global circuit breaker.
        """
        if not self.rt_guard_enabled:
            return
        cfg = self.rt_guard_cfg
        now = time.time()
        min_samples_soft = int(cfg.get("min_samples_soft", 3) or 3)
        soft_mult = float(cfg.get("soft_mult", 2.5) or 2.5)
        soft_min = float(cfg.get("soft_min_s", 60.0) or 60.0)
        hard_mult = float(cfg.get("hard_mult", 4.5) or 4.5)
        hard_min = float(cfg.get("hard_min_s", 180.0) or 180.0)
        progress_stall_s = float(cfg.get("progress_stall_s", 60.0) or 60.0)
        hedge_grace_s = float(cfg.get("hedge_grace_s", 45.0) or 45.0)
        hedge_max_per_job = int(cfg.get("hedge_max_per_job", 1) or 1)
        restart_max_per_job = int(cfg.get("restart_max_per_job", 1) or 1)
        single_worker_min_stall = float(cfg.get("single_worker_min_stall_s", 180.0) or 180.0)
        # --- Phase 1: snapshot timing state for every live inflight job ---
        snapshots = {}
        for tag, info in list(self.inflight.items()):
            w = self._worker_by_tag(tag)
            if (not w) or (not w.alive()):
                continue
            key = info.get("key")
            if not key:
                continue
            start = float(info.get("start", now) or now)
            elapsed = max(0.0, now - start)
            baseline = self._rt_baseline_for(tag)
            soft_th = max(soft_min, baseline * soft_mult)
            hard_th = max(hard_min, baseline * hard_mult)
            progress_ts = float(getattr(w, "guard_last_progress_ts", 0.0) or 0.0)
            if progress_ts <= 0.0:
                progress_ts = float(info.get("guard_last_progress_ts", start) or start)
            progress_ts = max(progress_ts, start)
            info["guard_last_progress_ts"] = progress_ts
            no_progress_for = max(0.0, now - progress_ts)
            stall_restart_s = self._rt_stall_restart_threshold(baseline, soft_th, progress_stall_s)
            snapshots[tag] = {
                "worker": w,
                "key": key,
                "start": start,
                "elapsed": elapsed,
                "baseline": baseline,
                "soft_th": soft_th,
                "hard_th": hard_th,
                "progress_ts": progress_ts,
                "no_progress_for": no_progress_for,
                "stall_restart_s": stall_restart_s,
            }
        # --- Phase 2: detect a simultaneous stall across all active workers ---
        global_wave_tags = set()
        active_tags = [tag for tag, snap in snapshots.items() if snap["key"] not in self.finished_set]
        if len(active_tags) > 1 and self.total_render_count >= max(1, min_samples_soft):
            all_stalled = True
            for tag in active_tags:
                snap = snapshots[tag]
                info = self.inflight.get(tag) or {}
                if (not bool(info.get("guard_progress_seen"))) or snap["elapsed"] < snap["stall_restart_s"] or snap["no_progress_for"] < snap["stall_restart_s"]:
                    all_stalled = False
                    break
            if all_stalled:
                global_wave_tags = set(active_tags)
                wave_gap = min(snapshots[tag]["no_progress_for"] for tag in active_tags)
                self._rt_guard_log(
                    "rt-global-stall-wave",
                    f"[MGPU-GUARD] Global no-progress wave detected across {len(active_tags)} workers "
                    f"(stall={wave_gap:.1f}s).",
                    every_s=15.0
                )
        # --- Phase 3: per-worker decisions (warmup, hints, hedge, restart) ---
        for tag, info in list(self.inflight.items()):
            snap = snapshots.get(tag)
            if not snap:
                continue
            w = snap["worker"]
            key = snap["key"]
            job = info.get("job")
            elapsed = snap["elapsed"]
            baseline = snap["baseline"]
            soft_th = snap["soft_th"]
            hard_th = snap["hard_th"]
            no_progress_for = snap["no_progress_for"]
            stall_restart_s = snap["stall_restart_s"]
            progress_seen = bool(info.get("guard_progress_seen"))
            state = self.rt_guard_job_state.setdefault(key, {"hedges": 0, "restarts": 0})
            # Warmup gate: suppress guard actions until enough jobs completed,
            # unless this worker is part of a global wave or the last straggler.
            warm = self._rt_warmup_state(tag)
            if warm.get("block") and tag not in global_wave_tags:
                tail_ready = self._rt_tail_straggler_ready(tag)
                if not tail_ready:
                    needs = []
                    if int(warm.get("need_worker", 0) or 0) > 0:
                        needs.append(f"worker={int(warm.get('done_worker', 0))}/{int(warm.get('need_worker', 0))}")
                    if int(warm.get("need_global", 0) or 0) > 0:
                        needs.append(f"global={int(warm.get('done_global', 0))}/{int(warm.get('need_global', 0))}")
                    detail = ", ".join(needs) if needs else "warmup"
                    self._rt_guard_log(
                        f"rt-warmup-skip-{tag}",
                        f"[MGPU-GUARD] {tag}: warmup skip for {self._job_label(job)} ({detail}).",
                        every_s=20.0
                    )
                    continue
                self._rt_guard_log(
                    f"rt-warmup-tail-{tag}",
                    f"[MGPU-GUARD] {tag}: warmup override (tail-straggler) for {self._job_label(job)}.",
                    every_s=20.0
                )
            restart_reason = None
            # Cycles fell back to CPU on a GPU worker: force CUDA and restart.
            if self.scene.render.engine == "CYCLES" and (not w.is_cpu):
                hint_ts = float(getattr(w, "cycles_cpu_hint_ts", 0.0) or 0.0)
                if hint_ts > 0.0 and hint_ts >= snap["start"]:
                    line_hint = str(getattr(w, "cycles_cpu_hint_line", "") or "").strip()
                    if self._cycles_backend_for_worker(w) == "OPTIX":
                        w.cycles_backend_override = "CUDA"
                        self._rt_guard_log(
                            f"rt-cpu-hint-switch-{tag}",
                            f"[MGPU-GUARD] {tag}: CPU hint detected; switching backend OPTIX -> CUDA.",
                            every_s=10.0
                        )
                    restart_reason = f"cpu-device-hint ({line_hint[:96]})"
            # Hedge: past the soft threshold, duplicate the job elsewhere once.
            if (key not in self.finished_set) and (restart_reason is None):
                can_hedge = (
                    tag not in global_wave_tags and
                    progress_seen and
                    elapsed >= soft_th and
                    self.total_render_count >= min_samples_soft and
                    (not info.get("guard_hedged")) and
                    state.get("hedges", 0) < hedge_max_per_job and
                    self._has_other_alive_worker(tag)
                )
                if can_hedge:
                    info["guard_hedged"] = True
                    info["guard_hedge_ts"] = now
                    state["hedges"] = int(state.get("hedges", 0)) + 1
                    self._duplicate_job_for_hedge(tag, job)
                    self._rt_guard_log(
                        f"rt-hedge-{key}",
                        f"[MGPU-GUARD] {tag}: hedge duplicate for {self._job_label(job)} "
                        f"(elapsed={elapsed:.1f}s, baseline={baseline:.1f}s, soft={soft_th:.1f}s).",
                        force=True
                    )
                    continue
            # Decide whether a restart is warranted (in priority order).
            if (key in self.finished_set) and info.get("guard_hedged"):
                hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
                if hedge_for >= hedge_grace_s:
                    restart_reason = f"hedged-copy-finished-elsewhere ({hedge_for:.0f}s)"
            if restart_reason is None and info.get("guard_hedged"):
                hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
                if progress_seen and hedge_for >= hedge_grace_s and no_progress_for >= stall_restart_s:
                    restart_reason = f"post-hedge no-progress {no_progress_for:.0f}s"
            if restart_reason is None and tag in global_wave_tags:
                restart_reason = f"global-no-progress-wave {no_progress_for:.0f}s"
            if restart_reason is None and elapsed >= hard_th:
                if progress_seen and no_progress_for >= progress_stall_s:
                    restart_reason = f"no-progress {no_progress_for:.0f}s"
                elif info.get("guard_hedged"):
                    hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
                    if hedge_for >= hedge_grace_s:
                        restart_reason = f"post-hedge slow ({hedge_for:.0f}s)"
                elif elapsed >= (hard_th * 1.35):
                    restart_reason = "hard-timeout"
            if restart_reason is None:
                continue
            # Restart caps and rate limits.
            if int(state.get("restarts", 0)) >= restart_max_per_job:
                self._rt_guard_log(
                    f"rt-restart-cap-{key}",
                    f"[MGPU-GUARD] {tag}: restart cap reached for {self._job_label(job)}; continuing without restart.",
                    every_s=45.0
                )
                continue
            if self._alive_worker_count() <= 1 and no_progress_for < single_worker_min_stall:
                self._rt_guard_log(
                    f"rt-single-skip-{tag}",
                    f"[MGPU-GUARD] {tag}: single-worker mode, delaying restart until stall>{int(single_worker_min_stall)}s.",
                    every_s=45.0
                )
                continue
            can_restart, why = self._rt_can_restart_worker(w, now)
            if not can_restart:
                self._rt_guard_log(
                    f"rt-restart-skip-{tag}-{why}",
                    f"[MGPU-GUARD] {tag}: restart skipped ({why}).",
                    every_s=30.0
                )
                continue
            state["restarts"] = int(state.get("restarts", 0)) + 1
            reason = (
                f"{restart_reason}; elapsed={elapsed:.1f}s baseline={baseline:.1f}s "
                f"stall={stall_restart_s:.1f}s soft={soft_th:.1f}s hard={hard_th:.1f}s"
            )
            ok = self._restart_worker_same_gpu(w, reason, info=info)
            if ok:
                self._rt_guard_log(
                    f"rt-restarted-{tag}",
                    f"[MGPU-GUARD] {tag}: restarted on same GPU ({reason}).",
                    force=True
                )
                self._open_diag_terminal_if_needed()
            else:
                self._rt_guard_log(
                    f"rt-restart-fail-{tag}",
                    f"[MGPU-GUARD] {tag}: restart failed ({reason}).",
                    force=True
                )
                self._open_diag_terminal_if_needed()
    def _handle_job_done(self, tag, msg):
        """Process a worker's job-completion message (success or failure).

        Resolves the job and its key from the inflight record (falling back
        to the message payload), routes failures through GPU-policy handling
        or the retry path, detects CPU-hint violations on Cycles GPU workers,
        updates timing stats for genuinely rendered jobs (skipped/reused
        outputs are excluded from the baseline), marks the job finished, and
        may trigger a periodic VRAM-hygiene recycle.
        """
        ok = bool(msg.get("ok", False))
        meta = dict(msg.get("meta") or {})
        inflight = self.inflight.pop(tag, None)
        job = inflight.get("job") if inflight else None
        key = inflight.get("key") if inflight else None
        if job is None:
            # Fall back to the payload when no inflight record exists.
            job = msg.get("job")
        if job is None:
            job = msg.get("done")
        if job is None:
            _log(f"WARNING: Missing job payload from {tag}")
            return
        if key is None:
            mode = "MARI" if isinstance(job, dict) else "FRAMES"
            key = self._job_key(job, mode)
        if not ok:
            w = self._worker_by_tag(tag)
            err_text = str(msg.get("err", "") or "")
            # GPU policy failures get dedicated handling (backend switch etc.).
            if self._handle_cycles_gpu_policy_failure(w, err_text, inflight=inflight):
                return
            self._handle_retry(tag, job, key=key, reason="failed")
            return
        if key in self.finished_set:
            # Hedged duplicate finished after the original: just tidy up.
            self.rt_guard_job_state.pop(key, None)
            return
        w = self._worker_by_tag(tag)
        if w:
            try:
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("SOCKET_DONE", str(key))
            except Exception:
                pass
        # A CPU-device hint during this job on a GPU worker invalidates it.
        if self.scene.render.engine == "CYCLES" and w and (not w.is_cpu):
            hint_ts = float(getattr(w, "cycles_cpu_hint_ts", 0.0) or 0.0)
            start_ts = float((inflight or {}).get("start", 0.0) or 0.0)
            if hint_ts > 0.0 and start_ts > 0.0 and hint_ts >= start_ts:
                err_text = f"GPU_POLICY_VIOLATION cpu-hint: {getattr(w, 'cycles_cpu_hint_line', '')}"
                if self._handle_cycles_gpu_policy_failure(w, err_text, inflight=inflight):
                    return
                self._handle_retry(tag, job, key=key, reason="cpu-hint")
                return
        rendered = meta.get("rendered")
        skipped = meta.get("skipped")
        if rendered is None:
            # Older workers don't send meta; infer "skipped" from the error text.
            err_text = str(msg.get("err", "") or "")
            if "skip existing" in err_text.lower():
                rendered = False
                skipped = True
            else:
                rendered = ok
        rendered = bool(rendered)
        skipped = bool(skipped)
        if rendered:
            elapsed = 0.0
            try:
                elapsed = float(meta.get("elapsed", 0.0) or 0.0)
            except Exception:
                elapsed = 0.0
            if elapsed <= 0.0 and inflight and inflight.get("start"):
                elapsed = max(0.0, time.time() - inflight["start"])
            self._update_worker_avg(tag, elapsed)
        elif skipped:
            self._rt_guard_log(
                f"rt-skip-{tag}-{key}",
                f"[MGPU-GUARD] {tag}: reused existing output for {self._job_label(job)}; not counted in timing baseline.",
                every_s=5.0
            )
        marked = self._mark_finished(key)
        if marked and rendered:
            self._rt_try_periodic_recycle_after_job(tag)
def _handle_retry(self, tag, job, key=None, reason="failed", prefer_other=False):
if job is None:
return
if key is None:
mode = "MARI" if isinstance(job, dict) else "FRAMES"
key = self._job_key(job, mode)
if key in self.finished_set:
return
self.inflight.pop(tag, None)
tries = self.retries.get(key, 0) + 1
self.retries[key] = tries
if tries > self.max_retries:
_log(f"Giving up on {self._job_label(job)} after {tries - 1} retries ({reason}).")
self._mark_finished(key)
return
_log(f"Retrying {self._job_label(job)} ({reason}) attempt {tries}/{self.max_retries}")
if self.job_mode == "MARI":
self._requeue_mari_for_tag(tag, job, prefer_other=prefer_other)
else:
try:
frame = int(job)
except Exception:
frame = job
self._requeue_frame_for_tag(tag, frame, prefer_other=prefer_other)
def _check_inflight_timeouts(self):
if self.rt_guard_enabled:
return
if len(self.workers) < 2:
return
now = time.time()
for tag, info in list(self.inflight.items()):
if info.get("stolen"):
continue
start = info.get("start", 0)
if not start:
continue
avg = self._avg_for_tag(tag)
if avg is None or avg <= 0.0:
continue
elapsed = now - start
threshold = max(avg * 2.5, 30.0)
if elapsed >= threshold:
info["stolen"] = True
job = info.get("job")
_log(f"Slow job on {tag} ({elapsed:.1f}s > {threshold:.1f}s). Requeueing {self._job_label(job)}.")
self._handle_retry(tag, job, reason="slow", prefer_other=True)
# ---------- terminal tail ----------
    def _write_header(self, w: Worker):
        """Write the one-time informational header to a worker's log.

        Summarizes engine, compute backend, bound device, thread budget, guard
        tier and output destination so each per-worker log is self-describing.
        Best-effort: any failure is swallowed (the header is cosmetic only).
        """
        try:
            scene = self.scene
            engine = scene.render.engine
            backend = self.device_mode
            # GPU Cycles workers may carry a per-worker backend (e.g. OptiX vs CUDA).
            if engine == "CYCLES" and not w.is_cpu:
                backend = self._cycles_backend_for_worker(w)
            dev = "CPU" if w.is_cpu else (
                f"GPU {w.phys_index if w.phys_index is not None else '?'} {(w.gpu_uuid or '')[:12]}"
                f"{(' bus=' + w.gpu_bus) if w.gpu_bus else ''}"
            )
            blend_name = os.path.basename(bpy.data.filepath or "untitled.blend")
            fstart, fend, fstep = scene.frame_start, scene.frame_end, scene.frame_step
            # Prefer the directory of the resolved per-frame path; fall back to the
            # raw (possibly relative) render filepath if frame_path() fails.
            try:
                out_dir = os.path.dirname(bpy.path.abspath(scene.render.frame_path(frame=fstart)))
            except Exception:
                out_dir = bpy.path.abspath(scene.render.filepath)
            header = (
                BANNER_ASCII + "\n"
                f"[MGPU-INFO] Tag: {w.tag}\n"
                f"[MGPU-INFO] Engine: {engine} | Backend: {backend} | Device: {dev} | Threads/child: {self.threads} | Guard: {self.render_guard_tier} | DenoiseGPU: {'ON' if self.denoise_on_gpu else 'OFF'}\n"
                f"[MGPU-INFO] .blend: {blend_name} | Frames: {fstart}–{fend} step {fstep}\n"
                f"[MGPU-INFO] Output dir: {out_dir}\n"
            )
            # Header lines do not count toward the periodic banner repeat.
            w._banner_lines_since_repeat = 0
            self._emit(w, header, count_for_banner=False)
        except Exception:
            pass
def _open_terminal_tail(self, w: Worker):
if not self.open_terms:
return
proc = self._spawn_tail_terminal(w.log_path, enable_vt=True)
if proc:
w.term_proc = proc
    def _launch_worker_process(self, w: Worker):
        """Spawn one headless Blender child for worker *w*.

        Builds the child command line (engine device flags, control-channel
        port/token, MARI and video-sequence extras), pins GPU workers to their
        exact device via CUDA_VISIBLE_DEVICES (UUID form), starts a daemon
        thread pumping the child's stdout, and records a launch event.

        Returns:
            bool: True if the process started; False if Popen raised (the
            classified reason is recorded via _record_launch_event).
        """
        # Touch the log file so tail terminals can attach before output starts.
        with open(w.log_path, "a", encoding="utf-8"): pass
        w.launch_attempted = True
        w.launch_ts = time.time()
        # Seed watchdog state so the progress guard measures from launch time.
        w.guard_last_progress_ts = w.launch_ts
        w.guard_last_progress_sig = ("LAUNCH", int(w.launch_ts))
        w.cycles_cpu_hint_ts = 0.0
        w.cycles_cpu_hint_line = ""
        blender_bin = bpy.app.binary_path
        launch_backend = str(self.device_mode or "").upper()
        launch_fallback = ""
        # GPU Cycles workers may get a per-worker backend and fallback device.
        if self.scene.render.engine == "CYCLES" and not w.is_cpu:
            launch_backend = self._cycles_backend_for_worker(w)
            launch_fallback = self._cycles_fallback_for_worker(w, launch_backend)
        cmd = [
            blender_bin, "--enable-autoexec",
        ]
        if getattr(self, "_enabled_addon_modules_csv", ""):
            cmd += ["--addons", self._enabled_addon_modules_csv]
        cmd += [
            "-b", self.temp_blend,
            "-P", self._child_script,
            "--",
        ]
        if self.scene.render.engine == "CYCLES":
            cli_dev = "CPU" if w.is_cpu else (launch_backend or self.device_mode or "CUDA")
            cmd += ["--cycles-device", str(cli_dev).upper()]
        # Control channel and per-worker settings consumed by the child script.
        cmd += [
            "--mgpu-port", str(self._server_port),
            "--mgpu-token", self._token,
            "--mgpu-tag", w.tag,
            "--mgpu-device", launch_backend or self.device_mode,
            "--mgpu-threads", str(self.threads),
            "--mgpu-usecpu", "1" if w.is_cpu else "0",
            "--mgpu-denoise-gpu", "1" if self.denoise_on_gpu else "0",
            "--mgpu-persistent", "1" if self.use_persistent_data else "0",
            "--mgpu-mode", self.job_mode,
        ]
        if (self.scene.render.engine == "CYCLES") and (not w.is_cpu):
            if w.gpu_bus:
                cmd += ["--mgpu-gpu-bus", w.gpu_bus]
            if w.gpu_name:
                cmd += ["--mgpu-gpu-name", w.gpu_name]
        if (self.scene.render.engine == "CYCLES") and (not w.is_cpu) and launch_fallback:
            cmd += ["--mgpu-fallback-device", launch_fallback]
        if getattr(self, "src_blend_dir", None):
            cmd += ["--src-dir", self.src_blend_dir]
        if getattr(self, "_enabled_addons_file", None):
            cmd += ["--mgpu-enabled-addons-file", self._enabled_addons_file]
        # Video runs render to a temp image sequence encoded after the fact.
        if self.video_mode and self.job_mode == "FRAMES" and self.video_seq_dir:
            cmd += [
                "--mgpu-seq-dir", self.video_seq_dir,
                "--mgpu-seq-format", self.video_seq_format,
                "--mgpu-seq-ext", self.video_seq_ext,
            ]
        if self._preflight_existing_check_done:
            cmd += ["--mgpu-prechecked-existing", "1"]
        # Pass MARI add-on path (so child imports & registers it)
        if self.job_mode == "MARI" and getattr(self, "_mari_dir", None):
            cmd += ["--mari-path", self._mari_dir]
        env = os.environ.copy()
        # Bind EXACT GPU via UUID
        if not w.is_cpu and w.gpu_uuid:
            env["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            env["CUDA_VISIBLE_DEVICES"] = w.gpu_uuid
        else:
            env.pop("CUDA_VISIBLE_DEVICES", None)
        # New process group so CTRL_BREAK can reach the child tree on Windows.
        creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if IS_WIN else 0
        try:
            proc = subprocess.Popen(
                cmd, env=env,
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                universal_newlines=True, bufsize=1,
                creationflags=creationflags
            )
            w.proc = proc
            w.launch_ok = True
            w.launch_pid = int(getattr(proc, "pid", -1))
            w.guard_last_progress_ts = time.time()
            w.guard_last_progress_sig = ("PROCESS_STARTED", w.launch_pid)
            _win_job_assign(proc)
            # Drain child stdout on a daemon thread (see _pump_stdout).
            w.stdout_thread = threading.Thread(target=self._pump_stdout, args=(w,), daemon=True)
            w.stdout_thread.start()
            self._record_launch_event(
                w, "LAUNCHED", "PROCESS_STARTED",
                f"pid={w.launch_pid} dev={'CPU' if w.is_cpu else (w.gpu_uuid or 'no-uuid')} "
                f"{(' bus=' + w.gpu_bus) if ((not w.is_cpu) and w.gpu_bus) else ''} "
                f"{(' cli_cycles_device=' + ('CPU' if w.is_cpu else (launch_backend or self.device_mode or 'CUDA'))) if (self.scene.render.engine == 'CYCLES') else ''} "
                f"backend={(launch_backend or self.device_mode)}{(' fallback=' + launch_fallback) if ((not w.is_cpu) and launch_fallback) else ''} "
                f"denoise_gpu={'ON' if self.denoise_on_gpu else 'OFF'}"
            )
            return True
        except Exception as e:
            w.launch_ok = False
            w.launch_pid = None
            reason = _classify_launch_exception(e)
            self._record_launch_event(w, "FAILED_TO_LAUNCH", reason, str(e))
            return False
def _emit(self, w: Worker, text: str, count_for_banner: bool = True):
try:
if w._log_fp:
w._log_fp.write(text)
w._log_fp.flush()
w.last_line = text.rstrip()
except Exception:
return
if not count_for_banner:
return
try:
step = int(_WORKER_BANNER_REPEAT_EVERY_LINES or 0)
if step <= 0:
return
added = int(str(text).count("\n"))
if added <= 0 and str(text):
added = 1
w._banner_lines_since_repeat = int(getattr(w, "_banner_lines_since_repeat", 0) or 0) + max(0, added)
if w._banner_lines_since_repeat < step:
return
w._banner_lines_since_repeat = 0
banner = "\n" + _WORKER_BANNER_REPEAT_TEXT
if w._log_fp:
w._log_fp.write(banner)
w._log_fp.flush()
w.last_line = banner.rstrip()
except Exception:
pass
    def _pump_stdout(self, w: Worker):
        """Reader thread: stream a child's stdout into its worker log.

        Recognizes child lifecycle markers (frame / MARI job start, finish,
        miss), Cycles CPU-fallback hints and generic sample/tile progress,
        emitting compact [MGPU-DASH]/[MGPU-PROG]/[MGPU-FAIL] lines. Live
        progress is throttled to every PERCENT_STEP percent or TIME_STEP
        seconds, rewriting the previous line via ANSI codes. Any exception
        terminates the pump silently (poll() notices the dead process).
        """
        PERCENT_STEP = 5.0   # update when percentage jumps by ≥5
        TIME_STEP    = 2.0   # every ≥2s
        try:
            for raw in w.proc.stdout:
                line = raw.strip()
                if not line:
                    continue
                # Heuristics: detect when a GPU Cycles child has silently
                # fallen back to CPU rendering, and surface the evidence.
                if (self.scene.render.engine == "CYCLES") and (not w.is_cpu):
                    ll = line.lower()
                    cpu_hint = False
                    if ("'cpu_enabled': true" in ll) or ('"cpu_enabled": true' in ll):
                        cpu_hint = True
                    if ("'scene_device': 'cpu'" in ll) or ('"scene_device": "cpu"' in ll):
                        cpu_hint = True
                    if not cpu_hint and re.search(r"\b(using|use|rendering on|fallback(?:ing)? to)\s+cpu\b", ll):
                        cpu_hint = True
                    if (
                        (not cpu_hint) and
                        re.search(r"\bdevice\b.{0,24}\bcpu\b", ll) and
                        ("scene_device" not in ll) and
                        ("cpu_enabled" not in ll)
                    ):
                        cpu_hint = True
                    if cpu_hint:
                        w.cycles_cpu_hint_ts = time.time()
                        w.cycles_cpu_hint_line = line[:220]
                        self._emit(w, f"[MGPU-GUARD] {w.tag} CPU device hint: {w.cycles_cpu_hint_line}\n")
                # Frame lifecycle from child markers
                ms = _CHILD_START_RE.match(line)
                if ms:
                    w.cur_frame = int(ms.group(2))
                    w.cur_path = ms.group(3)
                    w.frame_start_time = time.time()
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("START", w.cur_frame)
                    # Reset live-line state for the new frame.
                    w._live_line_active = False
                    w._last_pct = -1.0
                    w._last_emit_time = 0.0
                    self._emit(w, f"[MGPU-DASH] frame {w.cur_frame} preparing -> {w.cur_path}\n")
                    continue
                msm = _CHILD_MARI_START_RE.match(line)
                if msm:
                    action = str(msm.group(2) or "")
                    h = msm.group(3)
                    v = msm.group(4)
                    frame_txt = msm.group(5)
                    target = msm.group(6)
                    label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                    w.cur_frame = None
                    w.cur_path = target
                    w.frame_start_time = time.time()
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("MARI_START", label)
                    w._live_line_active = False
                    w._last_pct = -1.0
                    w._last_emit_time = 0.0
                    self._emit(w, f"[MGPU-DASH] {label} preparing -> {target}\n")
                    continue
                mf = _CHILD_FIN_RE.match(line)
                if mf:
                    frame = int(mf.group(2))
                    elapsed = float(mf.group(3))
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("FIN", frame)
                    s_cur, s_tot = w._last_samples
                    bar = _progress_bar(100.0, 20)
                    samples_txt = f" samples {s_tot}/{s_tot}" if (s_tot is not None) else ""
                    # Erase the previous live progress line before the final one.
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    final_line = f"{prefix}[MGPU-PROG] {w.tag} f{frame:>4} 100.0% [{bar}]{samples_txt} {elapsed:.1f}s\n"
                    self._emit(w, final_line)
                    # reset
                    w._live_line_active = False
                    w.cur_frame = None
                    w.cur_path = None
                    w.frame_start_time = 0.0
                    continue
                mfm = _CHILD_MARI_FIN_RE.match(line)
                if mfm:
                    action = str(mfm.group(2) or "")
                    h = mfm.group(3)
                    v = mfm.group(4)
                    frame_txt = mfm.group(5)
                    elapsed = float(mfm.group(6))
                    target = mfm.group(7)
                    label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("MARI_FIN", label)
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    self._emit(w, f"{prefix}[MGPU-PROG] {w.tag} {label} 100.0% {elapsed:.1f}s -> {target}\n")
                    w._live_line_active = False
                    w.cur_frame = None
                    w.cur_path = None
                    w.frame_start_time = 0.0
                    continue
                mm = _CHILD_MISS_RE.match(line)
                if mm:
                    frame = int(mm.group(2))
                    elapsed = float(mm.group(3))
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("MISS", frame)
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    self._emit(w, f"{prefix}[MGPU-FAIL] {w.tag} f{frame:>4} ({elapsed:.1f}s)\n")
                    w._live_line_active = False
                    w.cur_frame = None
                    w.cur_path = None
                    w.frame_start_time = 0.0
                    continue
                mmm = _CHILD_MARI_MISS_RE.match(line)
                if mmm:
                    action = str(mmm.group(2) or "")
                    h = mmm.group(3)
                    v = mmm.group(4)
                    frame_txt = mmm.group(5)
                    elapsed = float(mmm.group(6))
                    target = mmm.group(7)
                    label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                    w.guard_last_progress_ts = time.time()
                    w.guard_last_progress_sig = ("MARI_MISS", label)
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    self._emit(w, f"{prefix}[MGPU-FAIL] {w.tag} {label} ({elapsed:.1f}s) -> {target}\n")
                    w._live_line_active = False
                    w.cur_frame = None
                    w.cur_path = None
                    w.frame_start_time = 0.0
                    continue
                # Pass through diagnostics and child-tagged lines verbatim.
                if line.startswith("[MGPU-CHILD]") or "ERROR" in line or "WARNING" in line or "Traceback" in line:
                    self._emit(w, line + "\n")
                    continue
                if line.startswith("[MGPU-PROJ]"):
                    self._emit(w, line + "\n")
                    continue
                # Generic sample/tile progress (throttled live line).
                s_cur, s_tot, t_cur, t_tot = _parse_progress_fields(line)
                pct = _progress_percent(s_cur, s_tot, t_cur, t_tot)
                has_progress = any(v is not None for v in (s_cur, s_tot, t_cur, t_tot, pct))
                if has_progress:
                    try:
                        info = self.inflight.get(w.tag)
                        if info is not None:
                            info["guard_progress_seen"] = True
                    except Exception:
                        pass
                now = time.time()
                progress_sig = (s_cur, s_tot, t_cur, t_tot, (None if pct is None else int(pct)))
                if progress_sig != w.guard_last_progress_sig:
                    w.guard_last_progress_sig = progress_sig
                    w.guard_last_progress_ts = now
                if s_cur is not None or s_tot is not None:
                    w._last_samples = (s_cur, s_tot)
                should_emit = False
                if pct is not None:
                    if (pct - w._last_pct) >= PERCENT_STEP or (now - w._last_emit_time) >= TIME_STEP:
                        should_emit = True
                if should_emit and pct is not None:
                    bar = _progress_bar(pct, 20)
                    samples_txt = f" samples {s_cur}/{s_tot}" if (s_cur is not None and s_tot) else ""
                    elapsed = (now - w.frame_start_time) if w.frame_start_time else 0.0
                    fr = f"f{w.cur_frame:>4}" if (w.cur_frame is not None) else "f --"
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    out = f"{prefix}[MGPU-PROG] {w.tag} {fr} {pct:5.1f}% [{bar}]{samples_txt} {elapsed:.1f}s\n"
                    self._emit(w, out)
                    w._live_line_active = True
                    w._last_pct = pct
                    w._last_emit_time = now
        except Exception:
            pass
def prepare_and_spawn(self):
self._update_ram_capacity_note()
if self._selection_warning:
self._open_diag_terminal_if_needed()
if not self.pending:
msg = "[MGPU-LAUNCH] No worker launch needed; all pending outputs were resolved during preflight."
_log(msg)
self._diag_write(msg)
return
launched = 0
for w in self.workers:
try:
log_file = os.path.join(self.logs_dir, f"{w.tag}.log")
w.open_log(log_file)
self._write_header(w)
if self.open_terms:
self._open_terminal_tail(w)
ok = self._launch_worker_process(w)
if ok:
launched += 1
except Exception as e:
self._record_launch_event(w, "FAILED_TO_LAUNCH", "PREPARE_OR_OPEN_LOG_FAILED", str(e))
failed = len(self.workers) - launched
_log(f"[MGPU-LAUNCH] Spawn summary: planned={len(self.workers)} launched={launched} failed_to_launch={failed}")
self._diag_write(f"[MGPU-LAUNCH] Spawn summary: planned={len(self.workers)} launched={launched} failed_to_launch={failed}")
if self._ram_cap_note:
_log(f"[MGPU-LAUNCH] Note: {self._ram_cap_note}")
self._diag_write(f"[MGPU-LAUNCH] Note: {self._ram_cap_note}")
self._open_diag_terminal_if_needed()
if failed > 0:
self._open_diag_terminal_if_needed()
if launched <= 0:
raise RuntimeError("No workers launched. Check [MGPU-LAUNCH] lines for reasons.")
def _print_launch_status_summary(self, title="Status"):
_log(f"[MGPU-LAUNCH] {title} summary:")
self._diag_write(f"[MGPU-LAUNCH] {title} summary:")
for w in self.workers:
detail = f" | {w.launch_detail}" if w.launch_detail else ""
conn = " hello=yes" if w.hello_received else " hello=no"
pid = f" pid={w.launch_pid}" if w.launch_pid is not None else ""
line = f"[MGPU-LAUNCH] {w.tag}: {w.launch_state}/{w.launch_reason}{pid}{conn}{detail}"
_log(line)
self._diag_write(line)
def finish(self):
self._print_launch_status_summary("Final")
self._kill_all()
for w in self.workers:
try:
if getattr(w, "term_proc", None) and (w.term_proc.poll() is None):
if IS_WIN: w.term_proc.send_signal(signal.CTRL_BREAK_EVENT)
else: w.term_proc.terminate()
except Exception:
pass
for w in self.workers:
w.close_log()
# Build final video from temp frames (non-MARI mode only).
try:
if self.job_mode == "FRAMES" and self.video_mode and (not self._skip_video_encode):
self._encode_video_from_sequence()
elif self.job_mode == "FRAMES" and self.video_mode and self._skip_video_encode:
_log(f"[MGPU] Reused existing final video: {self.video_output_path}")
except Exception as _e:
print(f"[MGPU] Video encode failed: {_e}")
# --- NEW: package MARI media if requested ---
try:
if self.job_mode == "MARI":
self._package_mari_zip()
except Exception as _e:
print(f"[MGPU] ZIP packaging skipped/failed: {_e}")
# -------------------------------------------
try:
if self.job_mode == "MARI":
self._cleanup_mari_temp_dirs()
except Exception as _e:
print(f"[MGPU] TEMP cleanup skipped/failed: {_e}")
self._cleanup_temp()
def _package_mari_zip(self):
"""Create <blend-dir>/<render_settings_name>.zip that contains the entire MARI output folder."""
try:
prop = self.scene.mari_props
except Exception:
return
try:
save_zip = bool(getattr(prop, "mari_save_media", False))
except Exception:
save_zip = False
if not save_zip:
return
try:
base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
name = getattr(prop, "render_settings_name", "").strip()
src_dir = os.path.join(base, name)
if not (name and os.path.isdir(src_dir)):
print(f"[MGPU] ZIP: source folder missing or invalid: {src_dir}")
return
# Zip lives one level above the render folder, alongside it.
zip_path = os.path.join(base, f"{name}.zip")
from zipfile import ZipFile, ZIP_DEFLATED
with ZipFile(zip_path, 'w', ZIP_DEFLATED) as zipf:
for root, dirs, files in os.walk(src_dir):
for file in files:
full = os.path.join(root, file)
# Avoid adding the zip file into itself if it already exists.
if os.path.normpath(full) == os.path.normpath(zip_path):
continue
arc = os.path.relpath(full, start=src_dir)
# keep folder name at the top level in the archive
zipf.write(full, arcname=os.path.join(name, arc))
print(f"[MGPU] Wrote MARI media ZIP: {zip_path}")
except Exception as e:
print(f"[MGPU] ZIP packaging failed: {e}")
def _collect_video_frames(self):
if not self.video_seq_dir:
return []
try:
ext = (self.video_seq_ext or "").lower()
files = [f for f in os.listdir(self.video_seq_dir) if f.lower().endswith(ext)]
files.sort()
return files
except Exception:
return []
def _encode_video_from_sequence(self):
if not self.video_seq_dir or not self.video_output_path:
return
frames = self._collect_video_frames()
if not frames:
raise RuntimeError("No rendered frames found for video encode.")
expected = int(getattr(self, "total_frames", 0) or 0)
if expected and len(frames) < expected:
raise RuntimeError(f"Missing frames ({len(frames)}/{expected}) for video encode.")
_log(f"Encoding video from {len(frames)} frames -> {self.video_output_path}")
_mgpu_build_video_from_sequence(self.scene, self.video_seq_dir, frames, self.video_output_path)
    def start(self):
        """Validate inputs, stage the .blend copy, expand MARI ANIM jobs into
        per-frame jobs, run the output preflight, then start the control
        server and spawn workers.

        Raises:
            RuntimeError: Cycles selected but unavailable, or empty job list.
        """
        if (self.scene.render.engine == 'CYCLES') and (not _cycles_prefs()):
            raise RuntimeError("Cycles add-on is not enabled. Enable it in Preferences > Add-ons, or switch render engine to Eevee.")
        if self.job_mode == "FRAMES":
            if not self.frames:
                raise RuntimeError("No frames to render (check frame start/end).")
        else:
            if not self.pending:
                raise RuntimeError("No MARI camera jobs to render (job list empty).")
        self.prepare_blend_copy()
        # If MARI ANIM, expand camera jobs into per-frame jobs unless we're rendering video containers.
        # Explicit per-frame jobs supplied by MARI are preserved as-is so partial resumes can start immediately.
        expand_frames = (
            self.job_mode == "MARI" and
            self.mari_globals.get("action") == "ANIM" and
            not self.mari_globals.get("is_video", False)
        )
        if expand_frames:
            fstart, fend, fstep = self.scene.frame_start, self.scene.frame_end, max(1, self.scene.frame_step)
            expanded = []
            for j in self.mari_jobs:
                try:
                    existing_frame = int(j.get("frame", -1))
                except Exception:
                    existing_frame = -1
                # A job already carrying a frame number is kept verbatim.
                if existing_frame >= 0:
                    expanded.append(dict(j))
                    continue
                # Otherwise fan the camera job out across the scene frame range.
                for f in range(fstart, fend + 1, fstep):
                    jj = dict(j)
                    jj["frame"] = int(f)
                    expanded.append(jj)
            self.pending = expanded
            self.total_frames = len(expanded)
        self._preflight_existing_outputs()
        self._start_server()
        self.prepare_and_spawn()
def stop(self):
self._print_launch_status_summary("Stop")
self.cancelled = True
self._kill_all()
for w in self.workers:
try:
if getattr(w, "term_proc", None) and (w.term_proc.poll() is None):
if IS_WIN: w.term_proc.send_signal(signal.CTRL_BREAK_EVENT)
else: w.term_proc.terminate()
except Exception:
pass
for w in self.workers:
w.close_log()
self._cleanup_temp()
    def poll(self) -> bool:
        """Periodic tick driven by the modal operator's timer.

        Runs the watchdogs under the scheduler lock, requeues any job still
        owned by a worker whose process has exited, and returns True once
        everything is finished (after teardown via _shutdown_if_done).
        """
        with self._lock:
            self._check_render_time_guard()
            self._check_inflight_timeouts()
            self._check_launch_health()
        for w in self.workers:
            if w.proc and (w.proc.poll() is not None):
                # Child exited: give its stdout pump a brief moment to drain,
                # then requeue whatever job that worker still had in flight.
                try:
                    if w.stdout_thread:
                        w.stdout_thread.join(timeout=0.1)
                except Exception:
                    pass
                with self._lock:
                    info = self.inflight.get(w.tag)
                    if info:
                        self._handle_retry(w.tag, info.get("job"), key=info.get("key"),
                                           reason="worker-exit", prefer_other=True)
        return self._shutdown_if_done()
def _shutdown_if_done(self) -> bool:
all_frames_done = len(self.finished_set) >= getattr(self, "total_frames", 0)
procs_alive = any(getattr(w, "alive", lambda: False)() if callable(getattr(w, "alive", None)) else w.alive
for w in self.workers)
if all_frames_done and not procs_alive:
self.finish()
return True
return False
def _kill_all(self):
for w in self.workers:
if w.proc and (w.proc.poll() is None):
try:
if IS_WIN: w.proc.send_signal(signal.CTRL_BREAK_EVENT)
else: w.proc.terminate()
except Exception:
pass
for w in self.workers:
if w.term_proc and (w.term_proc.poll() is None):
try:
if IS_WIN: w.term_proc.send_signal(signal.CTRL_BREAK_EVENT)
else: w.term_proc.terminate()
except Exception:
pass
try:
if self._diag_term_proc and (self._diag_term_proc.poll() is None):
if IS_WIN: self._diag_term_proc.send_signal(signal.CTRL_BREAK_EVENT)
else: self._diag_term_proc.terminate()
except Exception:
pass
def _cleanup_mari_temp_dirs(self):
try:
prop = self.scene.mari_props
except Exception:
return
try:
base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
name = getattr(prop, "render_settings_name", "").strip()
root = os.path.join(base, name)
if not (name and os.path.isdir(root)):
return
for entry in os.scandir(root):
if entry.is_dir() and entry.name.upper().endswith("_TEMP"):
shutil.rmtree(entry.path, ignore_errors=True)
except Exception:
pass
def _cleanup_temp(self):
try:
if self._server_sock: self._server_sock.close()
except Exception:
pass
try:
if self._diag_log_fp:
self._diag_log_fp.close()
except Exception:
pass
try:
if self.temp_dir and os.path.isdir(self.temp_dir):
shutil.rmtree(self.temp_dir, ignore_errors=True)
except Exception:
pass
def _shorten_path(p, maxlen=96):
try:
p = os.path.normpath(p)
except Exception:
p = str(p)
if len(p) <= maxlen: return p
keep = max(12, maxlen // 2 - 3)
return p[:keep] + "..." + p[-(maxlen - keep - 3):]
# ----------------------- UI / operators -----------------------
class MGPU_RuntimePrefs(bpy.types.PropertyGroup):
    # Runtime scheduler settings, read by the operators/panel via
    # context.window_manager.mgpu_runtime_prefs.
    #
    # CPU threads handed to each child Blender; 0 lets Blender decide.
    threads_per_process: bpy.props.IntProperty(
        name="Threads per Process", min=0, max=256, default=0,
        description="CPU threads per child Blender (0 = Blender decides)"
    )
    # Number of worker processes launched per physical GPU.
    instances_per_gpu: bpy.props.IntProperty(
        name="Instances per GPU", min=1, max=8, default=1,
        description="Workers launched per physical GPU"
    )
    # Job distribution strategy: dynamic queue or fixed round-robin stride.
    dispatch_mode: bpy.props.EnumProperty(
        name="Dispatch",
        items=[("DYNAMIC","Dynamic Queue (recommended)",""),
               ("STRIDE","Stride (round-robin)","")],
        default="DYNAMIC"
    )
    # Attempts per failed frame before it is abandoned.
    max_retries: bpy.props.IntProperty(
        name="Max Retries per Frame", min=0, max=10, default=2
    )
    # Aggressiveness tier of the adaptive slow-frame watchdog.
    render_time_guard_tier: bpy.props.EnumProperty(
        name="Render-Time Guard",
        description="Adaptive slow-frame guard behavior: duplicate, then restart unhealthy worker on same GPU",
        items=[
            ("AGGRESSIVE", "Aggressive (default)", "Fast intervention; highest chance to restart slow workers"),
            ("BALANCED", "Balanced", "Moderate intervention and restart cadence"),
            ("CONSERVATIVE", "Conservative", "Rare intervention; safer for naturally long frames"),
            ("OFF", "Off", "Disable adaptive render-time guard"),
        ],
        default="AGGRESSIVE"
    )
    # Open one live log-tail terminal per worker (defaults on under Windows).
    open_terminals: bpy.props.BoolProperty(
        name="Open terminal windows for logs", default=IS_WIN
    )
    use_persistent_data: bpy.props.BoolProperty(
        name="Use Persistent Data", default=True,
        description="Keep render caches between frames in workers (uses more RAM)"
    )
    denoise_on_gpu: bpy.props.BoolProperty(
        name="Denoise on GPU", default=True,
        description="When enabled, workers try to use Blender's GPU denoiser (OptiX) where applicable"
    )
    use_target_dir_for_video_temp: bpy.props.BoolProperty(
        name="Temp Frames In Target Dir", default=True,
        description="Store video temp frames beside the final output file. Disable to use the system temp/AppData folder instead"
    )
    # Strategy used to filter phantom ("ghost") GPU entries during detection.
    ghost_filter_mode: bpy.props.EnumProperty(
        name="Ghost filter mode",
        description="How to build the GPU list (your system works best with STRICT − LEGACY).",
        items=[
            ("STRICT_MINUS_LEGACY", "Strict − Legacy (default)", "Use strict(full) minus legacy(ghost)"),
            ("LEGACY_MINUS_STRICT", "Legacy − Strict", "Use broad scan then subtract strict"),
            ("STRICT_ONLY", "Strict only", "Use strict set only"),
            ("LEGACY_ONLY", "Legacy only", "Use legacy set only"),
        ],
        default="STRICT_MINUS_LEGACY"
    )
class MGPU_OT_render_frames(bpy.types.Operator):
    # Modal operator that starts the multi-GPU frame renderer and polls the
    # global _MANAGER on a window timer until it finishes or is cancelled.
    bl_idname = "render.multi_gpu_frames"
    bl_label = "Render (Multi-GPU Frames)"
    bl_options = {'REGISTER', 'INTERNAL'}
    _timer = None
    # Multi-line confirmation text rendered by the overwrite dialog.
    confirm_message: bpy.props.StringProperty(default="")
    # Temp-sequence directory resolved in invoke() and handed to the manager.
    forced_temp_dir: bpy.props.StringProperty(default="")
    def _resolve_video_temp_dir(self, context):
        """Resolve the temp-frames directory for video output, honoring the
        'Temp Frames In Target Dir' runtime preference."""
        prefs = getattr(context.window_manager, "mgpu_runtime_prefs", None)
        use_target_dir = True
        if prefs is not None:
            use_target_dir = bool(getattr(prefs, "use_target_dir_for_video_temp", True))
        return _mgpu_video_temp_dir_for(context.scene, use_target_dir=use_target_dir)
    def draw(self, context):
        """Render the overwrite-confirmation dialog body (one label per line)."""
        layout = self.layout
        lines = [l for l in (self.confirm_message or "").split("\n") if l.strip()]
        if not lines:
            layout.label(text="Overwrite existing output?")
            return
        for line in lines:
            layout.label(text=line)
    def invoke(self, context, event=None):
        """Ask for confirmation when existing output would be overwritten;
        otherwise run immediately."""
        is_video = _mgpu_is_video(context.scene)
        temp_dir = self._resolve_video_temp_dir(context) if is_video else None
        if is_video:
            self.forced_temp_dir = temp_dir or ""
        if context.scene.render.use_overwrite:
            warnings = _mgpu_overwrite_warnings(context.scene, is_video, temp_dir=temp_dir)
            if warnings:
                self.confirm_message = "Overwrite existing output?\n" + "\n".join(warnings)
                return context.window_manager.invoke_confirm(self, event)
        return self.execute(context)
    def execute(self, context):
        """Create and start the MultiGPUManager, then go modal with a timer."""
        global _MANAGER
        _cleanup_stale_manager()
        if _MANAGER is not None:
            self.report({'ERROR'}, "Multi-GPU job already running.")
            return {'CANCELLED'}
        is_video = _mgpu_is_video(context.scene)
        if is_video:
            if not self.forced_temp_dir:
                self.forced_temp_dir = self._resolve_video_temp_dir(context) or ""
            self.report({'INFO'}, "Video output detected. Rendering to a temp image sequence, then encoding.")
        p = context.window_manager.mgpu_runtime_prefs
        try:
            mgr = MultiGPUManager(
                context.scene,
                threads=p.threads_per_process,
                instances_per_gpu=p.instances_per_gpu,
                dispatch_mode=p.dispatch_mode,
                max_retries=p.max_retries,
                open_terms=p.open_terminals,
                ghost_mode=p.ghost_filter_mode,
                use_persistent_data=p.use_persistent_data,
                render_guard_tier=p.render_time_guard_tier,
                denoise_on_gpu=p.denoise_on_gpu
            )
            if is_video:
                mgr.video_mode = True
                mgr.video_output_path = bpy.path.abspath(context.scene.render.filepath)
                if self.forced_temp_dir:
                    mgr._forced_temp_dir = self.forced_temp_dir
            mgr.start()
        except Exception as e:
            self.report({'ERROR'}, str(e)); return {'CANCELLED'}
        _MANAGER = mgr
        wm = context.window_manager
        self._timer = wm.event_timer_add(0.25, window=context.window)
        wm.modal_handler_add(self)
        _log("Multi-GPU frames started.")
        return {'RUNNING_MODAL'}
    def modal(self, context, event):
        """Timer-driven loop: ESC cancels; each tick polls the manager and
        finishes when the manager reports done."""
        global _MANAGER
        if _MANAGER is None:
            return {'CANCELLED'}
        if event and event.type == 'ESC':
            _MANAGER.stop(); _MANAGER = None
            try: context.window_manager.event_timer_remove(self._timer)
            except Exception: pass
            self.report({'INFO'}, "Multi-GPU cancelled.")
            return {'CANCELLED'}
        if event.type == 'TIMER':
            try:
                done = _MANAGER.poll()
                if done:
                    try:
                        context.window_manager.event_timer_remove(self._timer)
                    except Exception:
                        pass
                    _MANAGER = None
                    # Nudge the UI to repaint once the job ends.
                    try:
                        bpy.ops.wm.redraw_timer(type='DRAW_WIN_SWAP', iterations=1)
                    except Exception:
                        pass
                    self.report({'INFO'}, "Multi-GPU render finished.")
                    return {'FINISHED'}
            except Exception as e:
                _log(f"Manager error: {e}")
                _MANAGER.stop(); _MANAGER = None
                try: context.window_manager.event_timer_remove(self._timer)
                except Exception: pass
                self.report({'ERROR'}, str(e))
                return {'CANCELLED'}
        return {'RUNNING_MODAL'}
class MGPU_OT_render_mari(bpy.types.Operator):
    # Modal operator that dispatches MARI camera jobs across headless workers.
    bl_idname = "render.multi_gpu_mari"
    bl_label = "Render (Multi-Instance MARI)"
    bl_options = {'REGISTER', 'INTERNAL'}
    # Payload of the form {"jobs":[{"cam_name":..., "H":..,"V":..},...]}
    job_json: bpy.props.StringProperty(name="Jobs JSON")
    mode: bpy.props.EnumProperty(items=[("FRAME","FRAME",""),("CIRCLE","CIRCLE","")], default="FRAME")
    action: bpy.props.EnumProperty(items=[("STILL","STILL",""),("ANIM","ANIM","")], default="STILL")
    _timer = None
    def invoke(self, context, event=None):
        """Validate the job payload, export the .mari3d bundle, start the
        multi-instance manager, and run modally until completion.

        Fixes vs. previous revision:
        - ``target_dir`` is pre-bound so the folder-preparation error message
          can no longer raise NameError when path resolution itself fails.
        - ``mgr.start()`` failures are reported to the UI and the manager is
          torn down instead of leaking a half-initialized global _MANAGER
          (matching the frames operator's behavior).
        """
        global _MANAGER
        _cleanup_stale_manager()
        if _MANAGER is not None:
            self.report({'ERROR'}, "Multi-Instance job already running.")
            return {'CANCELLED'}
        try:
            payload = json.loads(self.job_json or "{}")
            jobs = payload.get("jobs") or []
        except Exception as e:
            self.report({'ERROR'}, f"Bad job_json: {e}")
            return {'CANCELLED'}
        if not jobs:
            self.report({'ERROR'}, "No MARI jobs provided.")
            return {'CANCELLED'}
        # Prepare globals sent to children
        is_video = _mgpu_is_video(context.scene)
        if self.action == "STILL" and is_video:
            self.report({'ERROR'}, "Cannot render STILL directly to video. Switch to an image format or use ANIM.")
            return {'CANCELLED'}
        mari_prop = getattr(context.scene, "mari_props", None)
        mari_settings = {}
        if mari_prop:
            try:
                mari_settings = {
                    "frame_ratio": [float(mari_prop.frame_ratio[0]), float(mari_prop.frame_ratio[1])],
                    "frame_dimensions": [float(mari_prop.frame_dimensions[0]), float(mari_prop.frame_dimensions[1])],
                    "frame_center": [float(mari_prop.frame_center[0]), float(mari_prop.frame_center[1]), float(mari_prop.frame_center[2])],
                    "frame_rotation": [float(mari_prop.frame_rotation[0]), float(mari_prop.frame_rotation[1]), float(mari_prop.frame_rotation[2])],
                    "render_settings_filepath": bpy.path.abspath(getattr(mari_prop, "render_settings_filepath", "")),
                    "render_settings_name": str(getattr(mari_prop, "render_settings_name", "") or ""),
                    "render_settings_normalize": bool(getattr(mari_prop, "render_settings_normalize", False)),
                }
            except Exception:
                mari_settings = {}
        mari_globals = {"mode": "FRAME" if self.mode == "FRAME" else "CIRCLE",
                        "action": self.action,
                        "is_video": is_video,
                        "use_overwrite": bool(getattr(context.scene.render, "use_overwrite", True)),
                        "use_placeholder": bool(getattr(context.scene.render, "use_placeholder", False)),
                        "render_resolution_x": int(getattr(context.scene.render, "resolution_x", 0)),
                        "render_resolution_y": int(getattr(context.scene.render, "resolution_y", 0)),
                        "render_resolution_percentage": int(getattr(context.scene.render, "resolution_percentage", 100)),
                        "mari_settings": mari_settings}
        # Pre-export .mari3d and ensure the output folder is prepared exactly like MARI does
        target_dir = ""  # pre-bind so the except below can reference it safely
        try:
            export_type = "FRAME" if self.mode == "FRAME" else "CIRCLE"
            # --- ensure the MARI output directory exists (mirrors MARI add-on) ---
            try:
                mari_prop = context.scene.mari_props
                base = bpy.path.abspath(getattr(mari_prop, "render_settings_filepath", ""))
                name = getattr(mari_prop, "render_settings_name", "").strip()
                target_dir = os.path.join(base, name)
                if context.scene.render.use_overwrite and os.path.isdir(target_dir):
                    shutil.rmtree(target_dir)
                os.makedirs(target_dir, exist_ok=True)
                print(f"[MGPU-PARENT] Ensured MARI output folder exists: {target_dir}")
            except Exception as _e:
                self.report({'ERROR'}, f"Could not prepare MARI folder: {target_dir} ({_e})")
                return {'CANCELLED'}
            # --------------------------------------------------------------------
            bpy.ops.mari.export_mari(action="RENDER", type=export_type, format=self.action)
            print(f"[MGPU-PARENT] Exported MARI .mari3d (type={export_type}, format={self.action})")
        except Exception as e:
            self.report({'ERROR'}, f"MARI export failed: {e}")
            return {'CANCELLED'}
        # Use same runtime prefs as frames operator
        p = context.window_manager.mgpu_runtime_prefs
        mgr = MultiGPUManager(context.scene, threads=p.threads_per_process,
                              instances_per_gpu=p.instances_per_gpu,
                              dispatch_mode=p.dispatch_mode, max_retries=p.max_retries,
                              open_terms=p.open_terminals,
                              use_persistent_data=p.use_persistent_data,
                              job_mode="MARI", mari_jobs=jobs, mari_globals=mari_globals,
                              render_guard_tier=p.render_time_guard_tier,
                              denoise_on_gpu=p.denoise_on_gpu)
        # start() prepares the blend copy, writes the child script and starts
        # the control server. On failure, tear down and report instead of
        # leaving a stale global manager behind.
        try:
            mgr.start()
        except Exception as e:
            try:
                mgr.stop()
            except Exception:
                pass
            self.report({'ERROR'}, str(e))
            return {'CANCELLED'}
        _MANAGER = mgr
        # Install modal timer for UI progress like frames op
        self._timer = context.window_manager.event_timer_add(0.3, window=context.window)
        context.window_manager.modal_handler_add(self)
        return {'RUNNING_MODAL'}
    def modal(self, context, event):
        """Timer-driven loop: ESC cancels the job; each tick polls the manager
        and, when finished, re-exports the .mari3d bundle."""
        global _MANAGER
        if event.type == 'ESC' and getattr(event, "value", 'PRESS') == 'PRESS':
            # _MANAGER may already be cleared (e.g. by the cancel operator).
            if _MANAGER:
                _MANAGER.stop()
            _MANAGER = None
            try:
                context.window_manager.event_timer_remove(self._timer)
            except Exception:
                pass
            self.report({'INFO'}, "Cancelled.")
            return {'CANCELLED'}
        if event.type == 'TIMER':
            if _MANAGER and _MANAGER.poll():
                # finished
                try:
                    context.window_manager.event_timer_remove(self._timer)
                except Exception:
                    pass
                _MANAGER = None
                # Export .mari3d to the same folder the single-instance flow would use
                try:
                    bpy.ops.mari.export_mari('EXEC_DEFAULT')
                    self.report({'INFO'}, "Completed (exported .mari3d).")
                except Exception as e:
                    self.report({'WARNING'}, f"Completed (but .mari3d export failed: {e})")
                return {'FINISHED'}
        return {'RUNNING_MODAL'}
class MGPU_OT_cancel(bpy.types.Operator):
    # Stops the currently running multi-GPU job, if any.
    bl_idname = "render.multi_gpu_frames_cancel"
    bl_label = "Cancel Multi-GPU Frames"
    bl_options = {'INTERNAL'}
    def execute(self, context):
        global _MANAGER
        mgr = _MANAGER
        if not mgr:
            self.report({'INFO'}, "No Multi-GPU job running.")
            return {'CANCELLED'}
        mgr.stop()
        _MANAGER = None
        self.report({'INFO'}, "Multi-GPU job cancelled.")
        return {'FINISHED'}
class MGPU_OT_open_logs(bpy.types.Operator):
    # Internal operator: reveal the current job's log directory in the
    # platform's file browser (Explorer / Finder / xdg-open).
    bl_idname = "render.multi_gpu_frames_open_logs"
    bl_label = "Open Logs Folder"
    bl_options = {'INTERNAL'}

    def execute(self, context):
        global _MANAGER
        if not (_MANAGER and _MANAGER.logs_dir):
            self.report({'ERROR'}, "No job/logs available.")
            return {'CANCELLED'}
        folder = _MANAGER.logs_dir
        if IS_WIN:
            os.startfile(folder)  # noqa
        elif IS_MAC:
            subprocess.call(["open", folder])
        else:
            subprocess.call(["xdg-open", folder])
        return {'FINISHED'}
class MGPU_PT_panel(bpy.types.Panel):
    # Render Properties panel: launch button, scheduler settings, GPU-detection
    # diagnostics (legacy vs. strict vs. final device lists), and a rough RAM
    # estimate for the number of worker instances that would be spawned.
    bl_label = "Multi-Instance Render"
    bl_space_type = 'PROPERTIES'
    bl_region_type = 'WINDOW'
    bl_context = "render"

    def draw(self, context):
        layout = self.layout
        p = context.window_manager.mgpu_runtime_prefs
        # --- main launch button ---
        col = layout.column(align=True)
        row = col.row(align=True)
        row.scale_y = 1.4
        row.alert = True
        row.operator("render.multi_gpu_frames", icon='RENDER_STILL', text="Render (Multi-GPU Frames)")
        # --- scheduler settings ---
        box = layout.box()
        box.label(text="Scheduler Settings")
        row = box.row(align=True)
        row.prop(p, "dispatch_mode")
        row = box.row(align=True)
        row.prop(p, "threads_per_process")
        row.prop(p, "max_retries")
        row = box.row(align=True)
        row.prop(p, "render_time_guard_tier")
        row = box.row(align=True)
        row.prop(p, "instances_per_gpu")
        row = box.row(align=True)
        row.prop(p, "use_persistent_data")
        row.prop(p, "open_terminals")
        row = box.row(align=True)
        row.prop(p, "denoise_on_gpu")
        row = box.row(align=True)
        row.prop(p, "use_target_dir_for_video_temp")
        box.label(text="Launch diagnostics: check console/logs for [MGPU-LAUNCH] reason codes.")
        if _mgpu_is_video(context.scene):
            # Show where intermediate per-worker frames will be staged for video output.
            temp_dir = _mgpu_video_temp_dir_for(
                context.scene,
                use_target_dir=bool(getattr(p, "use_target_dir_for_video_temp", True))
            )
            box.label(text=f"Video temp folder: {_shorten_path(temp_dir)}")
        # --- GPU detection diagnostics ---
        box2 = layout.box()
        box2.label(text="GPU Detection")
        row = box2.row(align=True)
        row.alert = True
        row.label(text="Please Find your Correct number/listing of GPUs")
        row = box2.row(align=True)
        row.prop(p, "ghost_filter_mode", expand=True)
        backend = _current_compute_type()
        legacy = _detect_gpu_devices_legacy(False)
        strict = _detect_gpu_devices_strict(True)
        final = _detect_gpu_devices_final_from_lists(p.ghost_filter_mode, legacy, strict)
        box2.label(text=f"Compute backend: {backend}")
        box2.label(text="Note: GPU index order follows NVIDIA/nvidia-smi and may differ from Windows Task Manager numbering.")
        final_note = ""
        # Fixed: use the draw callback's `context` argument rather than
        # bpy.context, consistent with the rest of this method.
        if (not final) and (context.scene.render.engine == "CYCLES"):
            final_note = " (no mapped GPU; CPU worker only if Cycles CPU device is enabled)"
        box2.label(text=f"Legacy:{len(legacy)} Strict:{len(strict)} Final:{len(final)}{final_note}")
        if len(final) < len(strict):
            box2.label(text="Note: Final < Strict; ghost filter may be excluding one or more GPUs.")
        # Map the final selection to physical GPU UUIDs and drop unresolved rows.
        mapped_all = _map_selection_to_uuids(final)
        mapped, dropped_unknown = _filter_known_mapped_gpus(mapped_all)
        if dropped_unknown:
            box2.label(text=f"Hidden unresolved GPU entries: {len(dropped_unknown)} (index '?').")
        if len(mapped) < len(final):
            box2.label(text="Note: Mapping lost devices; check [MGPU-GPUSEL] and [MGPU-LAUNCH] logs.")
        # Cross-check the mapped set against nvidia-smi's physical inventory.
        phys = _win_query_nvidia_smi_detailed() or []
        if phys:
            mapped_phys = {m.get("phys_index") for m in mapped if m.get("phys_index") is not None}
            if len(mapped_phys) < len(phys):
                phys_idx = {g.get("index") for g in phys if g.get("index") is not None}
                missing_idx = sorted([i for i in phys_idx if i not in mapped_phys])
                box2.label(text=f"Warning: NVIDIA physical GPUs={len(phys)} but mapped={len(mapped_phys)}.")
                if missing_idx:
                    box2.label(text=f"Unmapped NVIDIA index(es): {', '.join(str(i) for i in missing_idx)}")
                box2.label(text="Missing GPUs can be backend/type filtered; check [MGPU-GPUSEL] logs.")
        for m in mapped:
            label = f" [GPU {m['phys_index'] if m['phys_index'] is not None else '?'}] {m.get('name') or '?'}"
            c_nm = str(m.get("cycles_name") or "")
            p_nm = str(m.get("name") or "")
            # Flag rows where Cycles reports a different device name than nvidia-smi.
            if c_nm and _normalize_gpu_name(c_nm) != _normalize_gpu_name(p_nm):
                label += f" (Cycles row: {c_nm})"
            box2.label(text=label)
        # --- RAM estimate: per-child ~= 80% of current RSS (512 MiB floor) ---
        est = layout.box()
        rss = _proc_rss_bytes()
        avail = _sys_mem_available_bytes()
        n_workers = (len(mapped) if mapped else 1) * (p.instances_per_gpu if mapped else 1)
        per_child = None if rss is None else max(int(rss * 0.8), 512 * 1024 * 1024)
        total_need = None if (per_child is None) else per_child * n_workers
        msg = f"Instances planned: {n_workers} | Blender RSS: {_fmt_bytes(rss) if rss else '?'}"
        est.label(text=msg)
        if total_need is not None and avail is not None:
            risk = " (high risk of OOM)" if total_need > avail*0.8 else ""
            est.label(text=f"Estimated RAM needed: {_fmt_bytes(total_need)} | Free: {_fmt_bytes(avail)}{risk}")
        else:
            est.label(text="RAM estimate not available on this platform (ok to ignore).")
        # --- promo box shown only when the MARI addon is absent ---
        if not _mgpu_has_mari_addon():
            ad = layout.box()
            ad.label(text="Render & Share Holographic 3D Images!!")
            row = ad.row(align=True)
            row.scale_y = 1.3
            row.alert = True
            row.operator("wm.url_open", text="holomari.com", icon='URL').url = "https://holomari.com/info/index"
# ----------------------- registration -----------------------
class MGPU_RuntimePrefsReg(bpy.types.AddonPreferences):
    # Minimal AddonPreferences stub — actual settings live in the runtime
    # prefs shown in the Render Properties panel.
    bl_idname = ADDON_KEY

    def draw(self, context):
        layout = self.layout
        layout.label(text="Use the panel in Render Properties â–¸ Multi-GPU Frames.")
def _add_keymap():
    """Bind our operator to Ctrl+F12 (animation), and remove any old F12 binding we created."""
    keyconf = bpy.context.window_manager.keyconfigs.addon
    if not keyconf:
        # No addon keyconfig (e.g. background mode) — nothing to bind.
        return
    # Scrub stale plain-F12 bindings left behind by earlier addon versions.
    for map_name in ("Screen", "Window"):
        keymap = keyconf.keymaps.get(map_name)
        if not keymap:
            continue
        stale = [item for item in keymap.keymap_items
                 if item.idname == "render.multi_gpu_frames"
                 and item.type == 'F12' and not item.ctrl]
        for item in stale:
            try:
                keymap.keymap_items.remove(item)
            except Exception:
                pass
    # Register the Ctrl+F12 binding and remember it for session teardown.
    keymap = keyconf.keymaps.new(name="Screen", space_type="EMPTY", region_type='WINDOW')
    item = keymap.keymap_items.new("render.multi_gpu_frames", 'F12', 'PRESS', ctrl=True)
    _KM_ITEMS.append((keymap, item))
def _remove_keymap():
    """Remove only the keymap items we added during this session."""
    while _KM_ITEMS:
        keymap, item = _KM_ITEMS.pop()
        try:
            keymap.keymap_items.remove(item)
        except Exception:
            # Keymap may already be gone during shutdown; ignore.
            pass
def register():
    """Register addon classes, attach the runtime-prefs pointer, add the keymap."""
    for cls in (
        MGPU_RuntimePrefs,
        MGPU_RuntimePrefsReg,
        MGPU_OT_render_frames,
        MGPU_OT_cancel,
        MGPU_OT_open_logs,
        MGPU_PT_panel,
        MGPU_OT_render_mari,
    ):
        bpy.utils.register_class(cls)
    bpy.types.WindowManager.mgpu_runtime_prefs = bpy.props.PointerProperty(type=MGPU_RuntimePrefs)
    _add_keymap()
def unregister():
    """Tear everything down: keymap, window-manager property, then classes."""
    _remove_keymap()
    try:
        del bpy.types.WindowManager.mgpu_runtime_prefs
    except Exception:
        pass
    # Unregister in reverse of registration order; each class individually
    # best-effort so one failure doesn't block the rest.
    for cls in (
        MGPU_PT_panel,
        MGPU_OT_open_logs,
        MGPU_OT_cancel,
        MGPU_OT_render_frames,
        MGPU_OT_render_mari,
        MGPU_RuntimePrefsReg,
        MGPU_RuntimePrefs,
    ):
        try:
            bpy.utils.unregister_class(cls)
        except Exception:
            pass