From 2cf6132b780bddb2c4ecdec3a5422fe955976c92 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:12:06 +0000 Subject: [PATCH 1/7] Initial plan From 672e9732fefda2e3168125ca68b4e81abba599e5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:17:01 +0000 Subject: [PATCH 2/7] Add datalad support for upload with auto-detection - Add DataladMode enum with yes/no/auto options - Add --datalad CLI option to upload command - Add utility functions to detect git-annex repos and register URLs - Add backend checking to warn if not SHA256E - Integrate datalad URL registration after successful upload Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/cli/cmd_upload.py | 16 +++- dandi/upload.py | 174 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletion(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 1f057dc36..98c90daa0 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -9,7 +9,7 @@ instance_option, map_to_click_exceptions, ) -from ..upload import UploadExisting, UploadValidation +from ..upload import DataladMode, UploadExisting, UploadValidation @click.command() @@ -48,6 +48,18 @@ default="require", show_default=True, ) +@click.option( + "--datalad", + type=click.Choice(list(DataladMode)), + default="no", + show_default=True, + help=( + "Enable datalad/git-annex support to link local uploaded files with remote URLs. " + "'yes' - always use datalad; " + "'no' - never use datalad (default); " + "'auto' - auto-detect git-annex repository and use datalad if found." + ), +) @click.argument("paths", nargs=-1) # , type=click.Path(exists=True, dir_okay=False)) # & # Development options: Set DANDI_DEVEL for them to become available @@ -75,6 +87,7 @@ def upload( dandi_instance: str, existing: UploadExisting, validation: UploadValidation, + datalad: DataladMode, # Development options should come as kwargs allow_any_path: bool = False, upload_dandiset_metadata: bool = False, @@ -115,4 +128,5 @@ def upload( jobs=jobs, jobs_per_file=jobs_per_file, sync=sync, + datalad=datalad, ) diff --git a/dandi/upload.py b/dandi/upload.py index e68c71fb2..85237f981 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -6,9 +6,11 @@ from enum import Enum from functools import reduce import io +import os import os.path from pathlib import Path import re +import subprocess import time from time import sleep from typing import Any, TypedDict, cast @@ -66,6 +68,114 @@ def _check_dandidownload_paths(dfile: DandiFile) -> None: ) +def _is_git_annex_repo(path: Path) -> bool: + """ + Check if the given path is within a git-annex repository. + + Returns True if .git/annex directory exists in the repository root. + """ + try: + # Find git repository root + result = subprocess.run( + ["git", "rev-parse", "--show-toplevel"], + cwd=path if path.is_dir() else path.parent, + capture_output=True, + text=True, + check=False, + ) + if result.returncode == 0: + repo_root = Path(result.stdout.strip()) + annex_dir = repo_root / ".git" / "annex" + return annex_dir.exists() + except Exception as e: + lgr.debug("Error checking for git-annex repository: %s", e) + return False + + +def _get_file_annex_backend(filepath: Path) -> str | None: + """ + Get the git-annex backend for a file if it's annexed. + + Returns the backend name (e.g., 'SHA256E') or None if file is not annexed. + """ + if not filepath.exists(): + return None + + try: + # Check if file is a symlink (git-annex files are symlinks) + if not filepath.is_symlink(): + return None + + # Read the symlink target + target = os.readlink(filepath) + + # Parse the backend from the annex key format + # Typical format: ../../.git/annex/objects/XX/YY/SHA256E-sNNNNN--HASH/HASH + parts = Path(target).parts + for part in parts: + if "-s" in part: + # This looks like an annex key + backend = part.split("-")[0] + return backend + except Exception as e: + lgr.debug("Error checking git-annex backend for %s: %s", filepath, e) + + return None + + +def _register_url_with_annex( + filepath: Path, remote_url: str, expected_size: int | None = None +) -> bool: + """ + Register a remote URL with git-annex for a file. + + Args: + filepath: Local file path + remote_url: Remote URL to register + expected_size: Expected file size (for validation) + + Returns: + True if successful, False otherwise + """ + try: + # Use git-annex addurl with --relaxed to skip size/hash checks + # and --file to specify the target file + cmd = [ + "git", + "annex", + "addurl", + "--relaxed", + "--file", + str(filepath), + remote_url, + ] + + lgr.debug("Registering URL with git-annex: %s -> %s", filepath, remote_url) + + result = subprocess.run( + cmd, + cwd=filepath.parent, + capture_output=True, + text=True, + check=False, + ) + + if result.returncode == 0: + lgr.info("Successfully registered %s with git-annex", filepath) + return True + else: + lgr.warning( + "Failed to register %s with git-annex: %s", + filepath, + result.stderr.strip(), + ) + return False + + except Exception as e: + lgr.warning("Error registering URL with git-annex for %s: %s", filepath, e) + return False + + class Uploaded(TypedDict): size: int errors: list[str] @@ -91,6 +201,17 @@ def __str__(self) -> str: return self.value +class DataladMode(str, Enum): + """Mode for datalad/git-annex integration during upload""" + + YES = "yes" + NO = "no" + AUTO = "auto" + + def __str__(self) -> str: + return self.value + + def upload( paths: Sequence[str | Path] | None = None, existing: UploadExisting = UploadExisting.REFRESH, @@ -102,6 +223,7 @@ def upload( jobs: int | None = None, jobs_per_file: int | None = None, sync: bool = False, + datalad: DataladMode = DataladMode.NO, ) -> None: if paths: paths = [Path(p).absolute() for p in paths] @@ -114,6 +236,16 @@ def upload( " paths. Use 'dandi download' or 'organize' first." ) + # Determine if we should use datalad based on mode + use_datalad = False + if datalad == DataladMode.YES: + use_datalad = True + elif datalad == DataladMode.AUTO: + # Auto-detect git-annex repository + use_datalad = _is_git_annex_repo(dandiset.path) + if use_datalad: + lgr.info("Auto-detected git-annex repository; enabling datalad support") + with ExitStack() as stack: # We need to use the client as a context manager in order to ensure the # session gets properly closed. Otherwise, pytest sometimes complains @@ -384,9 +516,12 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: # yield {"status": "uploading"} validating = False + uploaded_asset: RemoteAsset | None = None for r in dfile.iter_upload( remote_dandiset, metadata, jobs=jobs_per_file, replacing=extant ): + if r["status"] == "done": + uploaded_asset = r.get("asset") r.pop("asset", None) # to keep pyout from choking if r["status"] == "uploading": uploaded_paths[strpath]["size"] = r.pop("current") @@ -398,6 +533,45 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: validating = True else: yield r + + # Handle datalad integration after successful upload + if use_datalad and uploaded_asset is not None: + try: + # Check backend and warn if not SHA256E + current_backend = _get_file_annex_backend(dfile.filepath) + if current_backend and current_backend != "SHA256E": + lgr.warning( + "%s: File backend is %s, but dandiset standard is SHA256E", + strpath, + current_backend, + ) + + # Get the remote URL from asset metadata + asset_metadata = uploaded_asset.get_raw_metadata() + content_urls = asset_metadata.get("contentUrl", []) + + if content_urls: + # Register the first URL (typically the S3 URL) + remote_url = content_urls[0] + file_size = asset_metadata.get("contentSize") + + if _register_url_with_annex( + dfile.filepath, remote_url, file_size + ): + lgr.debug( + "%s: Successfully registered remote URL with git-annex", + strpath, + ) + else: + lgr.debug( + "%s: No contentUrl found in asset metadata; skipping datalad registration", + strpath, + ) + except Exception as e: + lgr.warning( + "%s: Error during datalad integration: %s", strpath, e + ) + yield {"status": "done"} except Exception as exc: From 475c29cf233e6e42a7e4ea4b6b7e58558746e599 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:19:16 +0000 Subject: [PATCH 3/7] Add tests for datalad functionality - Add test_datalad_mode_enum to verify enum values - Add test_is_git_annex_repo for repository detection - Add test_get_file_annex_backend for backend checking - Register ai_generated marker in tox.ini - Tests skip gracefully when git-annex is not available Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/tests/test_upload.py | 85 ++++++++++++++++++++++++++++++++++++++ tox.ini | 2 + 2 files changed, 87 insertions(+) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 1bc382866..9e2829632 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -707,3 +707,88 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) -> match=f"contains {DOWNLOAD_SUFFIX} path which indicates incomplete download", ): new_dandiset.upload(allow_any_path=True) + + +@pytest.mark.ai_generated +def test_is_git_annex_repo(tmp_path: Path) -> None: + """Test detection of git-annex repositories.""" + import subprocess + + from ..upload import _is_git_annex_repo + + # Check if git-annex is available + try: + subprocess.run( + ["git", "annex", "version"], + capture_output=True, + check=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + pytest.skip("git-annex not available") + + # Not a git repo + assert not _is_git_annex_repo(tmp_path) + + # Create a git repo + subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) + assert not _is_git_annex_repo(tmp_path) + + # Initialize git-annex + subprocess.run( + ["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True + ) + assert _is_git_annex_repo(tmp_path) + + +@pytest.mark.ai_generated +def test_get_file_annex_backend(tmp_path: Path) -> None: + """Test getting the git-annex backend for a file.""" + import subprocess + + from ..upload import _get_file_annex_backend + + # Check if git-annex is available + try: + subprocess.run( + ["git", "annex", "version"], + capture_output=True, + check=True, + ) + except (FileNotFoundError, subprocess.CalledProcessError): + pytest.skip("git-annex not available") + + # Create a git-annex repo + subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) + subprocess.run( + ["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True + ) + + # Create a test file + test_file = tmp_path / "test.txt" + test_file.write_text("test content") + + # Not annexed yet + assert _get_file_annex_backend(test_file) is None + + # Add to annex + subprocess.run( + ["git", "annex", "add", "test.txt"], + cwd=tmp_path, + check=True, + capture_output=True, + ) + + # Should detect backend (default is SHA256E on most systems) + backend = _get_file_annex_backend(test_file) + assert backend is not None + + +@pytest.mark.ai_generated +def test_datalad_mode_enum() -> None: + """Test DataladMode enum values.""" + from ..upload import DataladMode + + assert DataladMode.YES.value == "yes" + assert DataladMode.NO.value == "no" + assert DataladMode.AUTO.value == "auto" + assert str(DataladMode.YES) == "yes" diff --git a/tox.ini b/tox.ini index ffdb9b6e9..136d1fac9 100644 --- a/tox.ini +++ b/tox.ini @@ -44,6 +44,8 @@ commands = sphinx-build -E -W -b html source build [pytest] addopts = --tb=short --durations=10 --timeout=300 +markers = + ai_generated: marks tests as AI-generated (deselect with '-m "not ai_generated"') filterwarnings = error ignore:No cached namespaces found .*:UserWarning From 296fa869295cb3af3973a829b0fdcc527526496f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:21:52 +0000 Subject: [PATCH 4/7] Fix type error and linting issues - Fix dandiset.path type error (use path_obj instead) - Fix line length issue in debug message - All mypy and flake8 checks pass Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/upload.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dandi/upload.py b/dandi/upload.py index 85237f981..a651af4f6 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -242,7 +242,7 @@ def upload( use_datalad = True elif datalad == DataladMode.AUTO: # Auto-detect git-annex repository - use_datalad = _is_git_annex_repo(dandiset.path) + use_datalad = _is_git_annex_repo(dandiset.path_obj) if use_datalad: lgr.info("Auto-detected git-annex repository; enabling datalad support") @@ -564,7 +564,8 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: ) else: lgr.debug( - "%s: No contentUrl found in asset metadata; skipping datalad registration", + "%s: No contentUrl found in asset metadata; " + "skipping datalad registration", strpath, ) except Exception as e: From 0658ea81d1aba5fe20dee36950eec86a67b9bf3a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:23:51 +0000 Subject: [PATCH 5/7] Address code review feedback - Improve _get_file_annex_backend to verify annex directory structure - Add detailed comment about contentUrl array usage - Use tuple for DataladMode choices in CLI - Keep __str__ method as it's needed for proper string conversion Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/cli/cmd_upload.py | 2 +- dandi/upload.py | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 98c90daa0..49b928811 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -50,7 +50,7 @@ ) @click.option( "--datalad", - type=click.Choice(list(DataladMode)), + type=click.Choice([e.value for e in DataladMode]), default="no", show_default=True, help=( diff --git a/dandi/upload.py b/dandi/upload.py index a651af4f6..8df250386 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -111,10 +111,15 @@ def _get_file_annex_backend(filepath: Path) -> str | None: # Parse the backend from the annex key format # Typical format: ../../.git/annex/objects/XX/YY/SHA256E-sNNNNN--HASH/HASH - parts = Path(target).parts - for part in parts: - if "-s" in part: - # This looks like an annex key + # Verify this is actually in the annex objects directory + target_path = Path(target) + if "annex" not in target_path.parts or "objects" not in target_path.parts: + return None + + # Find the part that looks like an annex key (contains backend and size) + for part in target_path.parts: + if "-s" in part and "--" in part: + # This looks like an annex key: BACKEND-sSIZE--HASH backend = part.split("-")[0] return backend except Exception as e: @@ -547,11 +552,15 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: ) # Get the remote URL from asset metadata + # The contentUrl field contains an array of URLs where the + # asset can be accessed. We use the first URL, which is + # typically the S3 URL. Additional URLs may include API + # endpoints or alternative storage locations. asset_metadata = uploaded_asset.get_raw_metadata() content_urls = asset_metadata.get("contentUrl", []) if content_urls: - # Register the first URL (typically the S3 URL) + # Register the first URL with git-annex remote_url = content_urls[0] file_size = asset_metadata.get("contentSize") From 4eb69e5e01811d8b9654b397ab2806ac81b8784d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:25:27 +0000 Subject: [PATCH 6/7] Use tuple comprehension for DataladMode choices - Change from list to tuple comprehension to avoid creating new list on each import - More efficient and follows best practices Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/cli/cmd_upload.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 49b928811..3b0372493 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -50,7 +50,7 @@ ) @click.option( "--datalad", - type=click.Choice([e.value for e in DataladMode]), + type=click.Choice(tuple(e.value for e in DataladMode)), default="no", show_default=True, help=( From fbb8785e6e73684057e4dbf1a0443fce270af9e2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 13 Feb 2026 21:50:17 +0000 Subject: [PATCH 7/7] Address PR review feedback - Rename DataladMode to GitAnnexMode and --datalad to --git-annex - Change default from 'no' to 'auto' - Improve git-annex repo detection (check for git-annex branch and annex.uuid) - Use filename for backend extraction - Improve log messages with URL details - Add polling loop for contentUrl (20s timeout with 2s intervals) - Select appropriate URL (prefer S3 for public access) - Remove exception swallowing - let errors raise with helpful hints - Consolidate tests into test_git_annex_repo_ops - Move subprocess import to top of test module - Remove ai_generated marker from tox.ini (already in pytest_plugin.py) Co-authored-by: yarikoptic <39889+yarikoptic@users.noreply.github.com> --- dandi/cli/cmd_upload.py | 20 ++--- dandi/tests/test_upload.py | 54 +++---------- dandi/upload.py | 157 ++++++++++++++++++++++++------------- tox.ini | 2 - 4 files changed, 126 insertions(+), 107 deletions(-) diff --git a/dandi/cli/cmd_upload.py b/dandi/cli/cmd_upload.py index 3b0372493..5711fd170 100644 --- a/dandi/cli/cmd_upload.py +++ b/dandi/cli/cmd_upload.py @@ -9,7 +9,7 @@ instance_option, map_to_click_exceptions, ) -from ..upload import DataladMode, UploadExisting, UploadValidation +from ..upload import GitAnnexMode, UploadExisting, UploadValidation @click.command() @@ -49,15 +49,15 @@ show_default=True, ) @click.option( - "--datalad", - type=click.Choice(tuple(e.value for e in DataladMode)), - default="no", + "--git-annex", + type=click.Choice(tuple(e.value for e in GitAnnexMode)), + default="auto", show_default=True, help=( - "Enable datalad/git-annex support to link local uploaded files with remote URLs. " - "'yes' - always use datalad; " - "'no' - never use datalad (default); " - "'auto' - auto-detect git-annex repository and use datalad if found." + "Enable git-annex support to link local uploaded files with remote URLs. " + "'yes' - always use git-annex; " + "'no' - never use git-annex; " + "'auto' - auto-detect git-annex repository and use if found (default)." ), ) @click.argument("paths", nargs=-1) # , type=click.Path(exists=True, dir_okay=False)) @@ -87,7 +87,7 @@ def upload( dandi_instance: str, existing: UploadExisting, validation: UploadValidation, - datalad: DataladMode, + git_annex: GitAnnexMode, # Development options should come as kwargs allow_any_path: bool = False, upload_dandiset_metadata: bool = False, @@ -128,5 +128,5 @@ def upload( jobs=jobs, jobs_per_file=jobs_per_file, sync=sync, - datalad=datalad, + git_annex=git_annex, ) diff --git a/dandi/tests/test_upload.py b/dandi/tests/test_upload.py index 9e2829632..b997e2c07 100644 --- a/dandi/tests/test_upload.py +++ b/dandi/tests/test_upload.py @@ -5,6 +5,7 @@ import os from pathlib import Path from shutil import copyfile, rmtree +import subprocess from typing import Any from unittest.mock import Mock from urllib.parse import urlparse @@ -710,11 +711,9 @@ def test_upload_rejects_dandidownload_nwb_file(new_dandiset: SampleDandiset) -> @pytest.mark.ai_generated -def test_is_git_annex_repo(tmp_path: Path) -> None: - """Test detection of git-annex repositories.""" - import subprocess - - from ..upload import _is_git_annex_repo +def test_git_annex_repo_ops(tmp_path: Path) -> None: + """Test git-annex repository detection and backend operations.""" + from ..upload import _get_file_annex_backend, _is_git_annex_repo, GitAnnexMode # Check if git-annex is available try: @@ -726,7 +725,13 @@ def test_is_git_annex_repo(tmp_path: Path) -> None: except (FileNotFoundError, subprocess.CalledProcessError): pytest.skip("git-annex not available") - # Not a git repo + # Test enum values + assert GitAnnexMode.YES.value == "yes" + assert GitAnnexMode.NO.value == "no" + assert GitAnnexMode.AUTO.value == "auto" + assert str(GitAnnexMode.YES) == "yes" + + # Test detection: Not a git repo assert not _is_git_annex_repo(tmp_path) # Create a git repo @@ -739,31 +744,7 @@ def test_is_git_annex_repo(tmp_path: Path) -> None: ) assert _is_git_annex_repo(tmp_path) - -@pytest.mark.ai_generated -def test_get_file_annex_backend(tmp_path: Path) -> None: - """Test getting the git-annex backend for a file.""" - import subprocess - - from ..upload import _get_file_annex_backend - - # Check if git-annex is available - try: - subprocess.run( - ["git", "annex", "version"], - capture_output=True, - check=True, - ) - except (FileNotFoundError, subprocess.CalledProcessError): - pytest.skip("git-annex not available") - - # Create a git-annex repo - subprocess.run(["git", "init"], cwd=tmp_path, check=True, capture_output=True) - subprocess.run( - ["git", "annex", "init"], cwd=tmp_path, check=True, capture_output=True - ) - - # Create a test file + # Test backend detection: Create a test file test_file = tmp_path / "test.txt" test_file.write_text("test content") @@ -781,14 +762,3 @@ def test_get_file_annex_backend(tmp_path: Path) -> None: # Should detect backend (default is SHA256E on most systems) backend = _get_file_annex_backend(test_file) assert backend is not None - - -@pytest.mark.ai_generated -def test_datalad_mode_enum() -> None: - """Test DataladMode enum values.""" - from ..upload import DataladMode - - assert DataladMode.YES.value == "yes" - assert DataladMode.NO.value == "no" - assert DataladMode.AUTO.value == "auto" - assert str(DataladMode.YES) == "yes" diff --git a/dandi/upload.py b/dandi/upload.py index 8df250386..adf3d3ef3 100644 --- a/dandi/upload.py +++ b/dandi/upload.py @@ -72,7 +72,8 @@ def _is_git_annex_repo(path: Path) -> bool: """ Check if the given path is within a git-annex repository. - Returns True if .git/annex directory exists in the repository root. + Returns True if git-annex is properly initialized (has git-annex branch + and annex.uuid config). """ try: # Find git repository root @@ -83,10 +84,33 @@ def _is_git_annex_repo(path: Path) -> bool: text=True, check=False, ) - if result.returncode == 0: - repo_root = Path(result.stdout.strip()) - annex_dir = repo_root / ".git" / "annex" - return annex_dir.exists() + if result.returncode != 0: + return False + + repo_root = Path(result.stdout.strip()) + + # Check for git-annex branch + branch_result = subprocess.run( + ["git", "show-ref", "--verify", "refs/heads/git-annex"], + cwd=repo_root, + capture_output=True, + check=False, + ) + if branch_result.returncode != 0: + return False + + # Check for annex.uuid config + config_result = subprocess.run( + ["git", "config", "annex.uuid"], + cwd=repo_root, + capture_output=True, + text=True, + check=False, + ) + if config_result.returncode != 0 or not config_result.stdout.strip(): + return False + + return True except Exception as e: lgr.debug("Error checking for git-annex repository: %s", e) return False @@ -120,7 +144,8 @@ def _get_file_annex_backend(filepath: Path) -> str | None: for part in target_path.parts: if "-s" in part and "--" in part: # This looks like an annex key: BACKEND-sSIZE--HASH - backend = part.split("-")[0] + # Extract just the filename (last part of path) + backend = Path(target).name.split("-")[0] return backend except Exception as e: lgr.debug("Error checking git-annex backend for %s: %s", filepath, e) @@ -166,11 +191,12 @@ def _register_url_with_annex( ) if result.returncode == 0: - lgr.info("Successfully registered %s with git-annex", filepath) + lgr.info("Successfully registered url %s to %s with git-annex", remote_url, filepath) return True else: lgr.warning( - "Failed to register %s with git-annex: %s", + "Failed to register url %s for %s with git-annex: %s", + remote_url, filepath, result.stderr.strip(), ) @@ -206,8 +232,8 @@ def __str__(self) -> str: return self.value -class DataladMode(str, Enum): - """Mode for datalad/git-annex integration during upload""" +class GitAnnexMode(str, Enum): + """Mode for git-annex integration during upload""" YES = "yes" NO = "no" @@ -228,7 +254,7 @@ def upload( jobs: int | None = None, jobs_per_file: int | None = None, sync: bool = False, - datalad: DataladMode = DataladMode.NO, + git_annex: GitAnnexMode = GitAnnexMode.AUTO, ) -> None: if paths: paths = [Path(p).absolute() for p in paths] @@ -241,15 +267,15 @@ def upload( " paths. Use 'dandi download' or 'organize' first." ) - # Determine if we should use datalad based on mode - use_datalad = False - if datalad == DataladMode.YES: - use_datalad = True - elif datalad == DataladMode.AUTO: + # Determine if we should use git-annex based on mode + use_git_annex = False + if git_annex == GitAnnexMode.YES: + use_git_annex = True + elif git_annex == GitAnnexMode.AUTO: # Auto-detect git-annex repository - use_datalad = _is_git_annex_repo(dandiset.path_obj) - if use_datalad: - lgr.info("Auto-detected git-annex repository; enabling datalad support") + use_git_annex = _is_git_annex_repo(dandiset.path_obj) + if use_git_annex: + lgr.info("Auto-detected git-annex repository; enabling git-annex support") with ExitStack() as stack: # We need to use the client as a context manager in order to ensure the @@ -539,49 +565,74 @@ def process_path(dfile: DandiFile) -> Iterator[dict]: else: yield r - # Handle datalad integration after successful upload - if use_datalad and uploaded_asset is not None: - try: - # Check backend and warn if not SHA256E - current_backend = _get_file_annex_backend(dfile.filepath) - if current_backend and current_backend != "SHA256E": - lgr.warning( - "%s: File backend is %s, but dandiset standard is SHA256E", - strpath, - current_backend, - ) + # Handle git-annex integration after successful upload + if use_git_annex and uploaded_asset is not None: + # Check backend and warn if not SHA256E + current_backend = _get_file_annex_backend(dfile.filepath) + if current_backend and current_backend != "SHA256E": + lgr.warning( + "%s: File backend is %s, but dandiset standard is SHA256E", + strpath, + current_backend, + ) - # Get the remote URL from asset metadata - # The contentUrl field contains an array of URLs where the - # asset can be accessed. We use the first URL, which is - # typically the S3 URL. Additional URLs may include API - # endpoints or alternative storage locations. + # Poll for contentUrl with timeout (may take time to appear) + # The contentUrl field contains an array of URLs where the + # asset can be accessed. For public dandisets, we prefer + # S3 URLs. For embargoed dandisets, we may need API URLs + # with authentication. + content_urls = None + max_wait = 20 # seconds + poll_interval = 2 # seconds + for attempt in range(max_wait // poll_interval): asset_metadata = uploaded_asset.get_raw_metadata() content_urls = asset_metadata.get("contentUrl", []) - if content_urls: - # Register the first URL with git-annex - remote_url = content_urls[0] - file_size = asset_metadata.get("contentSize") - - if _register_url_with_annex( - dfile.filepath, remote_url, file_size - ): - lgr.debug( - "%s: Successfully registered remote URL with git-annex", - strpath, - ) - else: + break + if attempt < (max_wait // poll_interval) - 1: lgr.debug( - "%s: No contentUrl found in asset metadata; " - "skipping datalad registration", + "%s: Waiting for contentUrl to be populated (attempt %d/%d)", strpath, + attempt + 1, + max_wait // poll_interval, ) - except Exception as e: - lgr.warning( - "%s: Error during datalad integration: %s", strpath, e + sleep(poll_interval) + + if not content_urls: + raise RuntimeError( + f"{strpath}: No contentUrl found in asset metadata after {max_wait}s. " + "Cannot register with git-annex. Use --git-annex no to disable." + ) + + # Select appropriate URL based on dandiset status + # For public dandisets: prefer S3 URLs (direct access) + # For embargoed: may need to use API URLs with auth + remote_url = None + for url in content_urls: + # Prefer S3 URLs for public access + if "s3.amazonaws.com" in url.lower() or "amazonaws.com" in url.lower(): + remote_url = url + break + + # Fall back to first URL if no S3 URL found + if not remote_url: + remote_url = content_urls[0] + + file_size = asset_metadata.get("contentSize") + + if not _register_url_with_annex( + dfile.filepath, remote_url, file_size + ): + raise RuntimeError( + f"{strpath}: Failed to register URL with git-annex. " + "Use --git-annex no to disable." ) + lgr.debug( + "%s: Successfully registered remote URL with git-annex", + strpath, + ) + yield {"status": "done"} except Exception as exc: diff --git a/tox.ini b/tox.ini index 136d1fac9..ffdb9b6e9 100644 --- a/tox.ini +++ b/tox.ini @@ -44,8 +44,6 @@ commands = sphinx-build -E -W -b html source build [pytest] addopts = --tb=short --durations=10 --timeout=300 -markers = - ai_generated: marks tests as AI-generated (deselect with '-m "not ai_generated"') filterwarnings = error ignore:No cached namespaces found .*:UserWarning