@@ -11,7 +11,6 @@ import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.SerializableConfiguration
import org.slf4j.{Logger, LoggerFactory}

-import java.io.File
import java.util.concurrent.TimeUnit

object LakeFSJobParams {
@@ -186,7 +185,7 @@ object LakeFSContext {
ranges.flatMap((range: Range) => {
val path = new Path(apiClient.getRangeURL(repoName, range.id))
val fs = path.getFileSystem(conf)
val localFile = File.createTempFile("lakefs.", ".range")
val localFile = StorageUtils.createTempFile(conf, "lakefs.", ".range")

fs.copyToLocalFile(false, path, new Path(localFile.getAbsolutePath), true)
val companion = Entry.messageCompanion

@@ -17,7 +17,6 @@ import scalapb.GeneratedMessageCompanion

import java.io.DataInput
import java.io.DataOutput
-import java.io.File
import java.net.URI
import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
@@ -92,7 +91,7 @@ class EntryRecordReader[Proto <: GeneratedMessage with scalapb.Message[Proto]](
var item: Item[Proto] = _
var rangeID: String = ""
override def initialize(split: InputSplit, context: TaskAttemptContext): Unit = {
-localFile = File.createTempFile("lakefs.", ".range")
+localFile = StorageUtils.createTempFile(context.getConfiguration, "lakefs.", ".range")
// Cleanup the local file - using the same technique as other data sources:
// https://github.com/apache/spark/blob/c0b1735c0bfeb1ff645d146e262d7ccd036a590e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala#L123
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => localFile.delete()))

@@ -63,7 +63,7 @@ object SSTableReader {
private def copyToLocal(configuration: Configuration, url: String) = {
val p = new Path(url)
val fs = p.getFileSystem(configuration)
val localFile = File.createTempFile("lakefs.", ".sstable")
val localFile = StorageUtils.createTempFile(configuration, "lakefs.", ".sstable")
// Cleanup the local file - using the same technic as other data sources:
// https://github.com/apache/spark/blob/c0b1735c0bfeb1ff645d146e262d7ccd036a590e/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextFileFormat.scala#L123
Option(TaskContext.get()).foreach(_.addTaskCompletionListener(_ => localFile.delete()))

@@ -7,8 +7,10 @@ import com.amazonaws.retry.RetryUtils
import com.amazonaws.services.s3.model.{Region, GetBucketLocationRequest}
import com.amazonaws.services.s3.{AmazonS3, AmazonS3ClientBuilder}
import com.amazonaws._
+import org.apache.hadoop.conf.Configuration
import org.slf4j.{Logger, LoggerFactory}

+import java.io.File
import java.net.URI
import java.util.concurrent.TimeUnit

@@ -161,6 +163,21 @@ object StorageUtils {
val GCSMaxBulkSize =
500 // 1000 is the max size, 500 is the recommended size to avoid timeouts or hitting HTTP size limits
}

+  /** Create a temporary file in the Spark local directory if configured.
+    * This ensures temporary files are stored in executor storage rather than system temp.
+    */
+  def createTempFile(configuration: Configuration, prefix: String, suffix: String): File = {
+    val sparkLocalDir = configuration.get("spark.local.dir")

Contributor:
I don't think that configuration lives there! AFAIU it is part of the Spark configuration, and Spark does not load it onto the Hadoop configuration because (IIUC) it's not under "spark.hadoop".

Example:

$ spark-shell --conf spark.local.dir="/tmp/local1/"    # This seems to be how to configure this

25/12/02 [...] 14:31:47 WARN Utils: Your hostname, ariels resolves to a loopback address: 127.0.1.1; using 192.168.1.54 instead (on interface wlp0s20f3)
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/02 14:31:49 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
Spark context Web UI available at http://192.168.1.54:4040
Spark context available as 'sc' (master = local[*], app id = local-1764678709842).
Spark session available as 'spark'.
Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /___/ .__/\_,_/_/ /_/\_\   version 3.5.6
      /_/
         
Using Scala version 2.12.18 (OpenJDK 64-Bit Server VM, Java 21.0.9)
Type in expressions to have them evaluated.
Type :help for more information.

scala> sc.hadoopConfiguration.get("local.dir")
res0: String = null

scala> sc.hadoopConfiguration.get("spark.local.dir")
res1: String = null

scala> sc.getConf.get("spark.local.dir")
res2: String = /tmp/local1/
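
For background on the point above: Spark mirrors only keys under the `spark.hadoop.` prefix into the Hadoop configuration, stripping the prefix as it copies. A minimal illustration; `lakefs.tmp.dir` is a hypothetical key used only for demonstration:

```scala
// Keys under spark.hadoop.* are copied into sc.hadoopConfiguration with the
// "spark.hadoop." prefix stripped. "lakefs.tmp.dir" is a hypothetical key:
//
//   spark-shell --conf spark.local.dir=/tmp/local1 \
//               --conf spark.hadoop.lakefs.tmp.dir=/tmp/local1

sc.hadoopConfiguration.get("lakefs.tmp.dir")   // "/tmp/local1"

// A plain spark.* key such as spark.local.dir stays in the SparkConf only:
sc.getConf.get("spark.local.dir")              // "/tmp/local1"
sc.hadoopConfiguration.get("spark.local.dir")  // null
```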

Contributor Author:
You are not wrong - my very bad implementation.
Fixing and testing before re-requesting review.

+    if (sparkLocalDir != null && !sparkLocalDir.isEmpty) {
+      val dir = new File(sparkLocalDir)
+      if (dir.exists() || dir.mkdirs()) {
+        return File.createTempFile(prefix, suffix, dir)
+      }
+    }
+    // Fallback to system temp directory
+    File.createTempFile(prefix, suffix)
+  }
}

class S3RetryDeleteObjectsCondition extends SDKDefaultRetryCondition {
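
Given the thread above, a corrected helper would read `spark.local.dir` from the Spark configuration rather than from the Hadoop `Configuration`. Below is a minimal sketch, assuming `SparkEnv` is reachable where the helper runs (it is on both driver and executors); it is not the final implementation from this PR:

```scala
import org.apache.spark.SparkEnv
import java.io.File

object TempFileSketch {
  def createTempFile(prefix: String, suffix: String): File = {
    // Read spark.local.dir from the Spark configuration, where it actually
    // lives; the Hadoop Configuration never carries this key.
    val localDir = Option(SparkEnv.get)
      .flatMap(env => Option(env.conf.get("spark.local.dir", null)))
      // spark.local.dir may be a comma-separated list of directories;
      // this sketch simply takes the first entry.
      .map(_.split(",").head.trim)
      .map(new File(_))
      .filter(dir => dir.exists() || dir.mkdirs())

    localDir match {
      case Some(dir) => File.createTempFile(prefix, suffix, dir)
      case None      => File.createTempFile(prefix, suffix) // system temp fallback
    }
  }
}
```

Note the `WARN SparkConf` line in the transcript: cluster managers override `spark.local.dir` via `SPARK_LOCAL_DIRS` (standalone/Kubernetes) and `LOCAL_DIRS` (YARN), so a production version would likely consult those environment variables as well, much as Spark's internal `Utils.getLocalDir` does.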