Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
b8d44ec
fix: support page-indexed TIFF reading for OME-TIFF live viewing
Mar 24, 2026
648f7ea
fix: Add error handling and improve logging for page-indexed TIFF paths
hongquanli Mar 25, 2026
2884653
refactor: Use page_idx parameter instead of #suffix for OME-TIFF pages
hongquanli Mar 25, 2026
bca9106
fix: Tighten types, fix tests, and use `is None` check
hongquanli Mar 25, 2026
bb949a5
perf: Cache TiffFile handles for multi-page OME-TIFF reads
hongquanli Mar 25, 2026
2350c07
fix: Only cache TiffFile handles for multi-page files, narrow lock scope
hongquanli Mar 25, 2026
89727ea
fix: Remove GIL-dependent double-checked locking, add multi-page test
hongquanli Mar 25, 2026
9955f7e
fix: Evict stale handles on IndexError, always cache, cleanup on end
hongquanli Mar 25, 2026
1fa8299
fix: Close evicted handle, hold lock through page read, add test
hongquanli Mar 25, 2026
06eb243
fix: Only cache TiffFile handles for multi-page files (page_idx > 0)
hongquanli Mar 25, 2026
1224f24
fix: Add LRU eviction to TiffFile handle cache (max 128)
hongquanli Mar 25, 2026
184f1ed
fix: Use per-file locks, remove page_idx==0 special case
hongquanli Mar 25, 2026
cfea8c0
test: Add edge case tests for multi-page TIFF loading
hongquanli Mar 25, 2026
1e0549f
fix: Acquire per-file lock before closing handles, validate page_idx
hongquanli Mar 25, 2026
502af94
fix: Use pytest.raises instead of self.assertRaises in plain class
hongquanli Mar 25, 2026
d557161
fix: Don't close handle on IndexError eviction (use-after-close race)
hongquanli Mar 25, 2026
d60b6e0
Merge remote-tracking branch 'origin/main' into fix/ome-tiff-page-ind…
hongquanli Mar 25, 2026
35eaab1
test: Add LRU cache stress test and concurrent register+load test
hongquanli Mar 25, 2026
3f28caa
fix: Fail loudly if test runs against wrong ndviewer_light install
hongquanli Mar 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 86 additions & 10 deletions ndviewer_light/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1461,8 +1461,16 @@ def __init__(self, dataset_path: str = ""):

# External navigation state (push-based API for live acquisition)
# _file_index is accessed from both main thread and dask workers, needs lock
self._file_index: Dict[tuple, str] = {} # (t, fov_idx, z, channel) -> filepath
# (t, fov_idx, z, channel) -> (filepath, page_idx)
self._file_index: Dict[Tuple[int, int, int, str], Tuple[str, int]] = {}
self._file_index_lock = threading.Lock()
# LRU cache of open TiffFile handles to avoid re-parsing IFD chains.
# Each entry is (TiffFile, per-file Lock) so reads to different files
# proceed in parallel while same-file reads are serialized.
# Bounded to avoid fd exhaustion with thousands of files.
self._tiff_handles: OrderedDict = OrderedDict() # filepath -> (tif, lock)
self._tiff_handles_lock = threading.Lock() # protects the dict itself
self._tiff_handles_max = 128
self._fov_labels: List[str] = [] # ["A1:0", "A1:1", ...]
self._channel_names: List[str] = []
self._z_levels: List[int] = []
Expand Down Expand Up @@ -1748,6 +1756,7 @@ def start_acquisition(
# Clear previous state
with self._file_index_lock:
self._file_index.clear()
self._close_tiff_handle_cache()
self._plane_cache.clear()
self._max_fov_per_time.clear()

Expand Down Expand Up @@ -1826,7 +1835,15 @@ def _rebuild_viewer_for_acquisition(self):
self._xarray_data = xarr
self._set_ndv_data(xarr)

def register_image(self, t: int, fov_idx: int, z: int, channel: str, filepath: str):
def register_image(
self,
t: int,
fov_idx: int,
z: int,
channel: str,
filepath: str,
page_idx: int = 0,
):
"""Register a newly saved image file.

Thread-safe: can be called from worker thread.
Expand All @@ -1838,10 +1855,15 @@ def register_image(self, t: int, fov_idx: int, z: int, channel: str, filepath: s
z: Z-level index
channel: Channel name
filepath: Path to the saved TIFF file
page_idx: Page index within the TIFF file (default 0).
For OME-TIFF stacks that store multiple planes per file,
specify which page to read.
"""
if page_idx < 0:
raise ValueError(f"page_idx must be >= 0, got {page_idx}")
# Update file index (protected by lock for dask worker thread safety)
with self._file_index_lock:
self._file_index[(t, fov_idx, z, channel)] = filepath
self._file_index[(t, fov_idx, z, channel)] = (filepath, page_idx)

Comment on lines 1864 to 1867
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

register_image() accepts page_idx but doesn’t validate it. Negative values will be treated as Python negative indexing (tif.pages[-1]) and could silently show the wrong plane; very large values will always hit the error path. Consider validating page_idx >= 0 at registration time (or clamping/returning zeros with a warning) so callers get deterministic behavior.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Fixed in 1e0549f - register_image() now raises ValueError for negative page_idx.

# Emit signal with raw indices - main thread computes max values
# to avoid race condition on _max_time_idx
Expand Down Expand Up @@ -2014,29 +2036,71 @@ def _load_single_plane(

# Load from file (lock protects concurrent access from dask workers)
with self._file_index_lock:
filepath = self._file_index.get(cache_key)
entry = self._file_index.get(cache_key)

if not filepath:
if entry is None:
# File not yet registered - expected during acquisition, not an error
return np.zeros((self._image_height, self._image_width), dtype=np.uint16)

filepath, page_idx = entry

if not LAZY_LOADING_AVAILABLE:
logger.error("tifffile not available for loading image planes")
return np.zeros((self._image_height, self._image_width), dtype=np.uint16)

try:
with tf.TiffFile(filepath) as tif:
plane = tif.pages[0].asarray()
self._plane_cache.put(cache_key, plane)
return plane
# Look up or create a cached (TiffFile, Lock) entry.
# Global lock is held only for dict bookkeeping; the per-file
# lock serializes reads to the same file while allowing parallel
# reads across different files.
evicted_entries = []
with self._tiff_handles_lock:
entry = self._tiff_handles.get(filepath)
if entry is not None:
tif, file_lock = entry
self._tiff_handles.move_to_end(filepath)
else:
tif = tf.TiffFile(filepath)
file_lock = threading.Lock()
self._tiff_handles[filepath] = (tif, file_lock)
# Evict LRU handles if over limit
while len(self._tiff_handles) > self._tiff_handles_max:
_, evicted = self._tiff_handles.popitem(last=False)
evicted_entries.append(evicted)
# Close evicted handles. Safe because LRU eviction only removes
# the oldest entry, which the current thread just moved away from
# (move_to_end). For another thread to hold a reference to the
# evicted entry, 128+ new files would need to open in the
# microseconds between its lookup and file_lock acquire.
for old_tif, old_lock in evicted_entries:
with old_lock:
self._close_tiff_handles([old_tif])
# Read page under per-file lock
Comment on lines +2067 to +2078
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LRU eviction closes TiffFile handles without coordinating with the per-file lock. Another thread may have already fetched (tif, file_lock) and be about to read (or currently reading) when the handle is closed, leading to races/crashes or reads on closed file descriptors. Consider acquiring the evicted entry’s file_lock before closing, and ensure readers acquire file_lock while still holding _tiff_handles_lock so an entry can’t be evicted/closed after a thread has taken a reference but before it has locked the file.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Fixed in 1e0549f - LRU eviction now acquires per-file lock before closing evicted handles.

with file_lock:
plane = tif.pages[page_idx].asarray()
Comment on lines +2066 to +2080
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potential race: _load_single_plane retrieves (tif, file_lock) under _tiff_handles_lock, releases the global lock, and only then acquires file_lock before reading. Another thread can evict/close (LRU eviction or _close_tiff_handle_cache) the same tif in that gap by acquiring file_lock first, leaving this thread to later acquire the lock and read from a closed handle. Consider acquiring file_lock while still holding _tiff_handles_lock (then release _tiff_handles_lock and proceed with file_lock held) so the handle cannot be closed/evicted between lookup and read.

Suggested change
# Evict LRU handles if over limit
while len(self._tiff_handles) > self._tiff_handles_max:
_, evicted = self._tiff_handles.popitem(last=False)
evicted_entries.append(evicted)
# Close evicted handles. Safe because LRU eviction only removes
# the oldest entry, which the current thread just moved away from
# (move_to_end). For another thread to hold a reference to the
# evicted entry, 128+ new files would need to open in the
# microseconds between its lookup and file_lock acquire.
for old_tif, old_lock in evicted_entries:
with old_lock:
self._close_tiff_handles([old_tif])
# Read page under per-file lock
with file_lock:
plane = tif.pages[page_idx].asarray()
# Acquire the per-file lock while still holding the global
# handles lock so that this handle cannot be closed/evicted
# between lookup/creation and use.
file_lock.acquire()
# Evict LRU handles if over limit. This will never evict the
# just-used entry because it was moved to the end and we pop
# from the front.
while len(self._tiff_handles) > self._tiff_handles_max:
_, evicted = self._tiff_handles.popitem(last=False)
evicted_entries.append(evicted)
# Close evicted handles outside the global lock, but still under
# each handle's own lock to serialize access.
for old_tif, old_lock in evicted_entries:
with old_lock:
self._close_tiff_handles([old_tif])
# Read page while holding the per-file lock that we acquired
# inside the global lock above. Ensure the lock is always
# released, even on error.
try:
plane = tif.pages[page_idx].asarray()
finally:
file_lock.release()

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Skipped - the LRU eviction race requires 128+ new files opening in the microseconds between lookup and file_lock acquire, which is practically impossible. The evicted entry is always the oldest (LRU), and the current thread just moved its entry to MRU. Acquiring file_lock inside the global lock would work but adds nested lock complexity for a race that can't happen in practice.

self._plane_cache.put(cache_key, plane)
return plane
except FileNotFoundError:
logger.warning("Image file not found (may have been deleted): %s", filepath)
except IndexError:
# Evict stale entry so next lookup re-opens the file and sees
# newly appended pages. Don't close the handle here — another
# thread may still hold a reference and be about to read. The
# handle will be closed at end_acquisition() / closeEvent().
with self._tiff_handles_lock:
self._tiff_handles.pop(filepath, None)
Comment on lines +2087 to +2091
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IndexError handling evicts the cached entry via pop(filepath, None) but does not close the corresponding TiffFile. Since the entry is removed from _tiff_handles, _close_tiff_handle_cache() will never see it, so this can leak file descriptors during live acquisition when pages are requested before they exist. Capture the popped (tif, file_lock) and close tif under file_lock (this becomes safe once lookup/read is synchronized as noted above).

Suggested change
# newly appended pages. Don't close the handle here — another
# thread may still hold a reference and be about to read. The
# handle will be closed at end_acquisition() / closeEvent().
with self._tiff_handles_lock:
self._tiff_handles.pop(filepath, None)
# newly appended pages. Close the handle explicitly here to
# avoid leaking file descriptors; this is safe once lookup/read
# is synchronized.
popped = None
with self._tiff_handles_lock:
popped = self._tiff_handles.pop(filepath, None)
if popped is not None:
tif, file_lock = popped
# Ensure no concurrent reads on this handle while closing.
with file_lock:
try:
tif.close()
except Exception:
# Best-effort close; log and continue.
logger.exception("Error while closing TIFF handle for %s", filepath)

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Intentional - closing here causes a use-after-close race: two threads reading the same file, thread A hits IndexError and closes, thread B (waiting on file_lock) then reads from a closed handle. We chose to orphan the fd instead — bounded by (retry count × slow-write files), cleaned up at end_acquisition()/closeEvent(). See commit d557161.

Comment on lines +2087 to +2091
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the IndexError handler you pop() the cached (tif, file_lock) entry from _tiff_handles but don’t close it (and since it’s removed from the cache, _close_tiff_handle_cache() will never see it). This will leak file descriptors over time when pages are requested before they exist. If closing immediately is unsafe, keep the popped handle+lock in a separate collection to be closed later in _close_tiff_handle_cache() (acquiring its file_lock first), or adjust the eviction strategy so the handle remains reachable for later cleanup.

Suggested change
# newly appended pages. Don't close the handle here — another
# thread may still hold a reference and be about to read. The
# handle will be closed at end_acquisition() / closeEvent().
with self._tiff_handles_lock:
self._tiff_handles.pop(filepath, None)
# newly appended pages. Also close the handle here to avoid
# leaking file descriptors. We first remove it from the cache
# under the cache lock, then acquire the per-file lock before
# closing to remain thread-safe with any concurrent readers.
handle_entry = None
with self._tiff_handles_lock:
handle_entry = self._tiff_handles.pop(filepath, None)
if handle_entry is not None:
tif, file_lock = handle_entry
try:
with file_lock:
try:
tif.close()
except Exception:
# Failure to close shouldn't prevent continued use;
# log at debug level to avoid spamming the user.
logger.debug(
"Failed to close TIFF handle for %s after IndexError",
filepath,
exc_info=True,
)
except Exception:
# If acquiring the file_lock fails for any reason, we still
# prefer not to crash this load path.
logger.debug(
"Failed to acquire file lock for %s while closing TIFF handle",
filepath,
exc_info=True,
)

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Intentional — closing here causes a use-after-close race when two threads read the same file and one hits IndexError (see commit d557161 for analysis). The orphaned fd is bounded by retry count and cleaned up at end_acquisition()/closeEvent(). The suggestion to keep popped handles in a separate collection is interesting but adds complexity for a narrow leak window.

logger.warning(
"Page %d not available in %s (file may still be writing)",
page_idx,
filepath,
)
Comment on lines +2085 to +2096
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new page-index behavior (IndexError path for pages not yet written, and handle eviction) isn’t covered by tests against the real LightweightViewer._load_single_plane. Consider adding a unit test that registers an image with an out-of-range page_idx and asserts it returns zeros and closes/evicts any cached handle, so regressions don’t silently reintroduce stale-handle behavior.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Fixed in 1fa8299 - added test_load_single_plane_returns_zeros_on_out_of_range_page that writes a single-page TIFF, requests an out-of-range page, and asserts zeros are returned.

except PermissionError as e:
logger.error("Permission denied reading image %s: %s", filepath, e)
except Exception as e:
logger.error(
"Failed to load image plane %s (t=%d, fov=%d, z=%d, ch=%s): %s",
"Failed to load image plane %s page %d (t=%d, fov=%d, z=%d, ch=%s): %s",
filepath,
page_idx,
t,
fov_idx,
z,
Expand Down Expand Up @@ -2138,6 +2202,7 @@ def end_acquisition(self):
self._acquisition_active = False
# NOTE: _fov_labels is NOT cleared here - navigation must still work
# after acquisition ends. Labels are cleared in start_acquisition().
self._close_tiff_handle_cache()
logger.info("NDViewer: Acquisition ended")

# ─────────────────────────────────────────────────────────────────────────
Expand Down Expand Up @@ -2774,6 +2839,16 @@ def _close_open_handles(self):
self._close_tiff_handles(getattr(self, "_open_handles", []))
self._open_handles = []

def _close_tiff_handle_cache(self):
"""Close cached TiffFile handles used by push-based OME-TIFF loading."""
with self._tiff_handles_lock:
entries = list(self._tiff_handles.values())
self._tiff_handles.clear()
# Acquire each per-file lock before closing to wait for in-flight readers.
for tif, file_lock in entries:
with file_lock:
self._close_tiff_handles([tif])

Comment on lines +2844 to +2851
Copy link

Copilot AI Mar 25, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

_close_tiff_handle_cache() closes cached TiffFile handles without synchronizing with their per-file locks. If dask worker threads are still reading (or about to read) when acquisition ends / viewer closes, this can close a handle mid-read. Consider coordinating shutdown with in-flight loads (e.g., acquire each cached entry’s file_lock before closing and/or ensure no new loads can start before clearing/closing the cache).

Suggested change
with self._tiff_handles_lock:
handles = [tif for tif, _ in self._tiff_handles.values()]
self._tiff_handles.clear()
self._close_tiff_handles(handles)
# Copy out the cached entries under the cache-level lock, then close each
# handle while holding its own per-file lock to avoid races with in-flight
# reads from worker threads.
with self._tiff_handles_lock:
entries = list(self._tiff_handles.values())
self._tiff_handles.clear()
for tif, file_lock in entries:
# Be defensive in case file_lock is None or missing.
if file_lock is None:
self._close_tiff_handles([tif])
continue
# Ensure no other thread is using this TiffFile while we close it.
try:
# threading.Lock and similar primitives support the context manager
# protocol; this will block until any in-flight readers complete.
with file_lock:
self._close_tiff_handles([tif])
except Exception as e:
# Fall back to best-effort close if acquiring the lock or closing fails.
logger.debug("Failed to safely close cached TiffFile handle: %s", e)

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Claude Code] Fixed in 1e0549f - _close_tiff_handle_cache now acquires each per-file lock before closing, waiting for in-flight readers to complete.

def closeEvent(self, event):
"""Clean up resources when the widget is closed."""
if self._refresh_timer:
Expand All @@ -2800,6 +2875,7 @@ def closeEvent(self, event):
with self._zarr_written_planes_lock:
self._zarr_written_planes.clear()
self._close_open_handles()
self._close_tiff_handle_cache()
super().closeEvent(event)

def _force_refresh(self):
Expand Down
Loading
Loading