Pass job data to executors; tolerate missing staging paths when downloading job files

andrii-i · andrii-i · commit 16b85caedf85 · 2025-09-23T07:39:25.000-07:00
diff --git a/jupyter_scheduler/executors.py b/jupyter_scheduler/executors.py
@@ -5,7 +5,7 @@
 import tarfile
 import traceback
 from abc import ABC, abstractmethod
-from typing import Dict
+from typing import Dict, Optional
 
 import fsspec
 import nbconvert
@@ -37,11 +37,13 @@ def __init__(
         db_url: str,
         staging_paths: Dict[str, str],
         database_manager_class,
+        job_data: Optional[Dict] = None,  # NEW: Optional job data for passing metadata
     ):
         self.job_id = job_id
         self.staging_paths = staging_paths
         self.root_dir = root_dir
         self.db_url = db_url
+        self.job_data = job_data  # Store for use by subclasses
 
         self.database_manager = self._create_database_manager(database_manager_class)
 
diff --git a/jupyter_scheduler/job_files_manager.py b/jupyter_scheduler/job_files_manager.py
@@ -57,6 +57,9 @@ def generate_filepaths(self):
         """A generator that produces filepaths"""
         output_formats = self.output_formats + ["input"]
         for output_format in output_formats:
+            # Skip if this format is not in staging_paths (e.g., input file for CronJob jobs)
+            if output_format not in self.staging_paths:
+                continue
             input_filepath = self.staging_paths[output_format]
             output_filepath = os.path.join(self.output_dir, self.output_filenames[output_format])
             if not os.path.exists(output_filepath) or self.redownload:
@@ -79,8 +82,19 @@ def generate_filepaths(self):
                             yield input_filepath, output_filepath
 
         if self.include_staging_files:
-            staging_dir = os.path.dirname(self.staging_paths["input"])
-            for file_relative_path in self.output_filenames["files"]:
+            # Handle missing "input" key gracefully - it may not exist for CronJob jobs
+            if "input" in self.staging_paths:
+                staging_dir = os.path.dirname(self.staging_paths["input"])
+            elif self.staging_paths:
+                # Fall back to any available staging path directory
+                staging_dir = os.path.dirname(next(iter(self.staging_paths.values())))
+            else:
+                # No staging paths available, skip
+                return
+
+            # Handle missing "files" key gracefully - it may not exist if packaged_files was empty
+            files_list = self.output_filenames.get("files", [])
+            for file_relative_path in files_list:
                 input_filepath = os.path.join(staging_dir, file_relative_path)
                 output_filepath = os.path.join(self.output_dir, file_relative_path)
                 if not os.path.exists(output_filepath) or self.redownload:
diff --git a/jupyter_scheduler/scheduler.py b/jupyter_scheduler/scheduler.py
@@ -489,6 +489,20 @@ def create_job(self, model: CreateJob) -> str:
             #
             # See: https://github.com/python/cpython/issues/66285
             # See also: https://github.com/jupyter/jupyter_core/pull/362
+            # Copy the job's fields into a plain dict so they can be handed to the spawned process
+            job_data = {
+                'job_id': job.job_id,
+                'name': job.name if hasattr(job, 'name') else None,
+                'input_filename': job.input_filename if hasattr(job, 'input_filename') else None,
+                'runtime_environment_name': job.runtime_environment_name if hasattr(job, 'runtime_environment_name') else None,
+                'runtime_environment_parameters': job.runtime_environment_parameters if hasattr(job, 'runtime_environment_parameters') else None,
+                'output_formats': job.output_formats if hasattr(job, 'output_formats') else [],
+                'parameters': job.parameters if hasattr(job, 'parameters') else None,
+                'tags': job.tags if hasattr(job, 'tags') else [],
+                'package_input_folder': job.package_input_folder if hasattr(job, 'package_input_folder') else False,
+                'packaged_files': job.packaged_files if hasattr(job, 'packaged_files') else [],
+            }
+
             mp_ctx = mp.get_context("spawn")
             p = mp_ctx.Process(
                 target=self.execution_manager_class(
@@ -497,6 +511,7 @@ def create_job(self, model: CreateJob) -> str:
                     root_dir=self.root_dir,
                     db_url=self.db_url,
                     database_manager_class=self.database_manager_class,
+                    job_data=job_data,
                 ).process
             )
             p.start()