Commit dda27d0

fix evict policy (#4127)

* fix evict policy
* fix test schedule

1 parent: fa1856c

2 files changed: +19 −9

lmdeploy/pytorch/paging/scheduler.py

Lines changed: 14 additions & 4 deletions
@@ -247,7 +247,11 @@ def _reorder_waiting():
     def _schedule_decoding(self, prealloc_size: int = 0):
         """Schedule decoding."""
 
-        running = self.running
+        def _reorder_running():
+            """Reorder running."""
+            return sorted(self.running, key=lambda seq: seq.arrive_time)
+
+        running = _reorder_running()
         assert len(running) != 0
 
         eviction_helper = self.eviction_helper
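
The functional change in this hunk: `_schedule_decoding` no longer walks `self.running` in insertion order; `_reorder_running` sorts the queue by `arrive_time`, so the oldest sequence is scheduled first (FCFS). A minimal sketch of the ordering rule, where `Seq` is a hypothetical stand-in for `SchedulerSequence`:

    from dataclasses import dataclass

    # Hypothetical stand-in for SchedulerSequence; only arrive_time matters here.
    @dataclass
    class Seq:
        seq_id: int
        arrive_time: float

    running = [Seq(2, 30.0), Seq(0, 10.0), Seq(1, 20.0)]

    # Same sort key as _reorder_running: earliest arrival first.
    ordered = sorted(running, key=lambda seq: seq.arrive_time)
    assert [s.seq_id for s in ordered] == [0, 1, 2]
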
@@ -271,9 +275,9 @@ def __evict_for_seq(seq: SchedulerSequence, num_required_blocks: int):
             return eviction_helper.evict_for_seq(seq, evictable, prealloc_size)
 
         # 1. running
-        for seq in running:
+        while len(running) > 0:
             # token + n
-
+            seq = running.pop(0)
             num_required_blocks = self.block_manager.num_required_blocks(seq, prealloc_size)
             if len(seq.logical_blocks) + num_required_blocks > self.block_manager.num_gpu_blocks:
                 # Reach max gpu cache size.
@@ -285,7 +289,13 @@ def __evict_for_seq(seq: SchedulerSequence, num_required_blocks: int):
                 seq.set_step(0)
                 continue
 
-            if not __evict_for_seq(seq, num_required_blocks):
+            while not __evict_for_seq(seq, num_required_blocks):
+                if len(running) == 0:
+                    break
+                seq_preempted = running.pop(-1)
+                self._set_message_status(seq_preempted, MessageStatus.WAITING)
+
+            if self.block_manager.get_num_free_gpu_blocks() < num_required_blocks:
                 self._set_message_status(seq, MessageStatus.WAITING)
                 continue
 
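Taken together, the last two hunks replace the single eviction attempt with an evict-or-preempt loop: sequences are popped from the front of the arrival-ordered queue, and whenever `__evict_for_seq` cannot free enough blocks, the youngest running sequence is popped from the tail and demoted to WAITING, until the current sequence fits or no candidates remain. A self-contained sketch of that policy with toy stand-ins for the block manager and eviction helper (none of these names are the real lmdeploy API):

    # Toy model of the new decoding schedule; every name is illustrative.
    class ToyScheduler:

        def __init__(self, num_free_blocks):
            self.num_free_blocks = num_free_blocks
            self.waiting = []

        def _evict_for_seq(self, seq):
            # The real helper evicts hanging/waiting caches; this toy
            # version only reports whether the sequence already fits.
            return seq['required'] <= self.num_free_blocks

        def schedule_decoding(self, running):
            running = sorted(running, key=lambda s: s['arrive_time'])
            scheduled = []
            while len(running) > 0:
                seq = running.pop(0)  # oldest sequence first
                # Preempt the youngest running sequences until `seq` fits.
                while not self._evict_for_seq(seq):
                    if len(running) == 0:
                        break
                    victim = running.pop(-1)  # youngest arrival
                    self.num_free_blocks += victim['allocated']
                    self.waiting.append(victim)
                if self.num_free_blocks < seq['required']:
                    self.waiting.append(seq)  # still no room: demote it too
                    continue
                self.num_free_blocks -= seq['required']
                scheduled.append(seq)
            return scheduled

    # Mirrors the updated test below: the older seq1 preempts the younger seq2.
    sched = ToyScheduler(num_free_blocks=0)
    seq1 = {'name': 'seq1', 'arrive_time': 1.0, 'required': 2, 'allocated': 0}
    seq2 = {'name': 'seq2', 'arrive_time': 2.0, 'required': 5, 'allocated': 4}
    done = sched.schedule_decoding([seq2, seq1])
    assert [s['name'] for s in done] == ['seq1']
    assert [s['name'] for s in sched.waiting] == ['seq2']
    assert sched.num_free_blocks == 2
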
tests/pytorch/paging/test_scheduler.py

Lines changed: 5 additions & 5 deletions
@@ -175,9 +175,9 @@ def test_evict(self, scheduler, block_size, num_gpu_blocks, num_cpu_blocks):
         seq2.update_token_ids(torch.tensor([1] * block_size))
         assert len(scheduler.running) == 2
         scheduler.schedule(is_prefill=False)
-        # seq1: 1 waiting cpu
-        # seq2: 4 running gpu
+        # seq1: 2 running gpu
+        # seq2: 4 waiting cpu
         # seq3: 3 nan
-        assert seq1.status == MessageStatus.WAITING
-        assert seq2.status == MessageStatus.RUNNING
-        assert block_manager.get_num_free_gpu_blocks() == 0
+        assert seq1.status == MessageStatus.RUNNING
+        assert seq2.status == MessageStatus.WAITING
+        assert block_manager.get_num_free_gpu_blocks() == 2
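
Note how the expectations flip under the new policy: before, the younger seq2 kept running on GPU while seq1 sat waiting on CPU; with FCFS ordering the older seq1 stays RUNNING and seq2 is preempted to WAITING, and in this test's configuration the blocks reclaimed from seq2, minus the two that seq1 occupies, leave 2 GPU blocks free.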
