
Commit 2bc6529

irexyc, lvhan028, and CyCle1024 authored
support context parallel (#3951)
* use driver flag
* update
* accurate mask iter
* use fast divmod
* remove cp_O
* remove unused
* return the last token's logprobs if include_stop_str_in_output is requested (#4000)
* [Fix] device args in chat cli when using pytorch engine (#3999)
* [Fix] device args in chat cli when using pytorch engine
* [Fix] change device into device_type in chat cli
* fix NULL raw data
* add attn_cp_size to cli
* build cutlass::FastDivmod on host
* use single buffer
* update comm
* use two stage reduce
* remove unused
* better AllreduceResidualRMSnorm
* fix max_session_len
* update docs
* fix embedding/lm_head split
* use same split_k on different cp_rank
* always use separate reduce for cp
* add cp configuration parameter
* remove redundant parameters
* remove redundant parameters
* fix build
* fix xgrammar build
* update docs
* remove unused
* fix test_attention
* unify attn split_k reduction w/ w/o cp
* fix nccl found
* update reduce
* fix windows build
* remove print
* revert is_driver_
* prevent create new allocator
* use Store to write partial_ML
* use expressive names
* use cdiv
* remove separate_reduce
* apply attention sink on cp_rank0
* move cp_utils.* to kernels/attention
* update cli description

---------

Co-authored-by: Lyu Han <[email protected]>
Co-authored-by: CyCle1024 <[email protected]>
1 parent efbba83 commit 2bc6529
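The squashed notes above mention writing partial `M`/`L` values from the attention split-k path and merging them with a two-stage reduce, unified with and without cp. For orientation only, here is a minimal NumPy sketch of the standard log-sum-exp merge that such reductions perform; it illustrates the technique, is not the repository's CUDA kernel, and all names are made up.

```python
import numpy as np

def merge_partials(parts):
    """Merge per-split partial attention results (O_i, M_i, L_i).

    Each split contributes an un-normalized output O_i, the row max M_i of the
    attention logits it saw, and the normalizer L_i = sum(exp(logits - M_i)).
    The merge rescales every partial by exp(M_i - M) against the global max M,
    the usual log-sum-exp trick behind split-k / context-parallel reductions.
    """
    M = max(m for _, m, _ in parts)                   # global row max
    L = sum(l * np.exp(m - M) for _, m, l in parts)   # global normalizer
    O = sum(o * np.exp(m - M) for o, m, _ in parts)   # rescaled partial outputs
    return O / L

# tiny self-check: splitting the keys must match a single-pass softmax attention
rng = np.random.default_rng(0)
q = rng.standard_normal(8)
k = rng.standard_normal((16, 8))
v = rng.standard_normal((16, 4))
s = k @ q                                             # logits for one query row
p = np.exp(s - s.max())
ref = (p / p.sum()) @ v

parts = []
for idx in (slice(0, 9), slice(9, 16)):               # uneven split, like two cp ranks
    si = s[idx]
    mi = si.max()
    parts.append((np.exp(si - mi) @ v[idx], mi, np.exp(si - mi).sum()))

assert np.allclose(merge_partials(parts), ref)
```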


45 files changed: +690 additions, -489 deletions

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -19,6 +19,7 @@ project(TurboMind LANGUAGES CXX CUDA)
 if (MSVC)
     # use standard conformant preprocessor
     add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:preprocessor>)
+    add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
     set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=/Zc:preprocessor -Xcompiler=/Zc:__cplusplus")
 endif ()

@@ -101,6 +102,10 @@ if(NOT xgrammar_POPULATED)

     # Bring the populated content into the build
     add_subdirectory(${xgrammar_SOURCE_DIR} ${xgrammar_BINARY_DIR})
+    if(TARGET xgrammar)
+        target_compile_options(xgrammar PRIVATE $<$<CXX_COMPILER_ID:MSVC>:/utf-8>)
+        target_compile_options(xgrammar PRIVATE $<$<C_COMPILER_ID:MSVC>:/utf-8>)
+    endif()
 endif()

 # the environment variable

benchmark/profile_throughput.py

Lines changed: 2 additions & 0 deletions
@@ -327,6 +327,7 @@ def parse_args():
     tb_group._group_actions.append(dtype_act)

     ArgumentHelper.dp(tb_group)
+    ArgumentHelper.cp(tb_group)
     ArgumentHelper.model_format(tb_group, default='hf')
     ArgumentHelper.num_tokens_per_iter(tb_group)
     ArgumentHelper.max_prefill_iters(tb_group)

@@ -344,6 +345,7 @@ def main():
         max_batch_size=args.concurrency // args.dp,
         tp=args.tp,
         dp=args.dp,
+        cp=args.cp,
         cache_max_entry_count=args.cache_max_entry_count,
         cache_block_seq_len=args.cache_block_seq_len,
         model_format=args.model_format,

builder/windows/generate.ps1

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-cmake .. -A x64 -T "v142,cuda=$env:CUDA_PATH" `
+cmake .. -A x64 -T "v143,cuda=$env:CUDA_PATH" `
   -DCMAKE_BUILD_TYPE=Release `
   -DCMAKE_INSTALL_PREFIX=install `
   -DBUILD_PY_FFI=ON `
docs/en/advance/context_parallel.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Context Parallel
+
+When the memory on a single GPU is insufficient to deploy a model, it is often deployed using tensor parallelism (TP), which generally requires `num_key_value_heads` to be divisible by `TP`. If you want to deploy with `TP > num_key_value_heads`, the kv-heads have to be duplicated to meet the divisibility requirement. However, this has two disadvantages:
+
+1. The amount of available kv_cache is halved, which reduces the maximum supported session length.
+2. The maximum inference batch size is reduced, leading to lower throughput.
+
+To address this issue, the TurboMind inference backend supports setting `attn_dp_size`, which avoids creating copies of the kv-heads, but this introduces data imbalance. To eliminate the data imbalance, TurboMind supports sequence parallelism, which allows the kv_cache to be stored interleaved across different cp_ranks. See the example below:
+
+```
+cp_rank=2, prompt_len=5, generation_len=4
+kv_cache stored on cp_rank0: 0, 2, 4, 6, 8
+kv_cache stored on cp_rank1: 1, 3, 5, 7
+```
+
+## Usage
+
+Taking Intern-S1 / Qwen3-235B-A22B as an example, their `num_key_value_heads` is 4. If you want to deploy with `TP=8` and avoid duplicating the kv_cache, you can deploy in the following way:
+
+```
+lmdeploy serve api_server internlm/Intern-S1 --tp 8 --cp 2
+
+lmdeploy serve api_server Qwen/Qwen3-235B-A22B --tp 8 --cp 2
+```
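The interleaved layout described in the new doc can be expressed in a few lines. The sketch below only illustrates the round-robin mapping from the example above; the helper names are made up and it is not code from TurboMind.

```python
def cp_rank_of(token_idx: int, cp_size: int) -> int:
    """Round-robin owner of a token's kv_cache slot (illustrative only)."""
    return token_idx % cp_size

def tokens_on_rank(total_len: int, cp_size: int, rank: int) -> list[int]:
    """Token positions whose kv_cache lives on `rank` under interleaved storage."""
    return [t for t in range(total_len) if cp_rank_of(t, cp_size) == rank]

# Reproduces the doc's example: cp=2, prompt_len=5, generation_len=4
total = 5 + 4
print(tokens_on_rank(total, 2, 0))  # [0, 2, 4, 6, 8]
print(tokens_on_rank(total, 2, 1))  # [1, 3, 5, 7]
```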

docs/en/index.rst

Lines changed: 1 addition & 0 deletions
@@ -103,6 +103,7 @@ Documentation
    advance/pytorch_multinodes.md
    advance/pytorch_profiling.md
    advance/metrics.md
+   advance/context_parallel.md

 .. toctree::
    :maxdepth: 1
docs/zh_cn/advance/context_parallel.md

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# Sequence Parallelism
+
+When a single GPU does not have enough memory to deploy a model, it is usually deployed with `TP`, which generally requires `num_key_value_heads` to be divisible by `TP`. To deploy with `TP > num_key_value_heads`, copies of the kv-heads have to be created to satisfy the divisibility requirement. This has two drawbacks:
+
+1. The amount of available kv_cache is halved, which reduces the maximum inference length of a request.
+2. The maximum inference batch size is reduced, lowering throughput.
+
+To solve this problem, the TurboMind inference backend supports setting `attn_dp_size`, which avoids creating copies of the kv-heads but introduces data imbalance. To eliminate the imbalance, TurboMind supports sequence parallelism, which stores the kv_cache interleaved across different cp_ranks, for example:
+
+```
+cp_rank=2, prompt_len=5, generation_len=4
+kv_cache stored on cp_rank0: 0, 2, 4, 6, 8
+kv_cache stored on cp_rank1: 1, 3, 5, 7
+```
+
+## Usage
+
+Taking `Intern-S1` / `Qwen3-235B-A22B` as an example, their `num_key_value_heads` is 4. To deploy with `TP=8` while avoiding kv_cache duplication, deploy as follows:
+
+```
+lmdeploy serve api_server internlm/Intern-S1 --tp 8 --cp 2
+
+lmdeploy serve api_server Qwen/Qwen3-235B-A22B --tp 8 --cp 2
+```

docs/zh_cn/index.rst

Lines changed: 1 addition & 0 deletions
@@ -104,6 +104,7 @@ LMDeploy 工具箱提供以下核心功能:
    advance/pytorch_multinodes.md
    advance/pytorch_profiling.md
    advance/metrics.md
+   advance/context_parallel.md

 .. toctree::
    :maxdepth: 1

lmdeploy/cli/cli.py

Lines changed: 1 addition & 0 deletions
@@ -76,6 +76,7 @@ def add_parser_chat():
         ArgumentHelper.model_format(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
         ArgumentHelper.communicator(tb_group)
+        ArgumentHelper.cp(tb_group)

     @staticmethod
     def add_parser_checkenv():
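With `--cp` now registered on the chat parser as well, the same context-parallel setting can be tried interactively. The command below is a plausible invocation based only on the flags this commit registers alongside the existing `--tp`; treat it as a sketch rather than documented usage.

```
lmdeploy chat internlm/Intern-S1 --tp 8 --cp 2
```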

lmdeploy/cli/serve.py

Lines changed: 6 additions & 2 deletions
@@ -112,7 +112,7 @@ def add_parser_api_server():
         model_format = ArgumentHelper.model_format(pt_group)
         hf_overrides = ArgumentHelper.hf_overrides(pt_group)
         disable_metrics = ArgumentHelper.disable_metrics(pt_group)
-        ArgumentHelper.dp(pt_group)
+        dp = ArgumentHelper.dp(pt_group)
         ArgumentHelper.ep(pt_group)
         ArgumentHelper.enable_microbatch(pt_group)
         ArgumentHelper.enable_eplb(pt_group)

@@ -137,6 +137,8 @@ def add_parser_api_server():
         tb_group._group_actions.append(model_format)
         tb_group._group_actions.append(hf_overrides)
         tb_group._group_actions.append(disable_metrics)
+        tb_group._group_actions.append(dp)
+        ArgumentHelper.cp(tb_group)
         ArgumentHelper.rope_scaling_factor(tb_group)
         ArgumentHelper.num_tokens_per_iter(tb_group)
         ArgumentHelper.max_prefill_iters(tb_group)

@@ -235,6 +237,8 @@ def api_server(args):
         from lmdeploy.messages import TurbomindEngineConfig
         backend_config = TurbomindEngineConfig(dtype=args.dtype,
                                                tp=args.tp,
+                                               dp=args.dp,
+                                               cp=args.cp,
                                                max_batch_size=max_batch_size,
                                                session_len=args.session_len,
                                                model_format=args.model_format,

@@ -253,7 +257,7 @@ def api_server(args):

         from lmdeploy.messages import VisionConfig
         vision_config = VisionConfig(args.vision_max_batch_size)
-        if args.dp == 1:
+        if args.dp == 1 or backend == 'turbomind':
             from lmdeploy.serve.openai.api_server import serve as run_api_server

             run_api_server(args.model_path,
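The same `cp` knob that the server wires into `TurbomindEngineConfig` above can also be set when building the config from Python. A minimal sketch, assuming the `lmdeploy.pipeline` API and that `TurbomindEngineConfig` accepts the new `cp` field exactly as the CLI passes it:

```python
from lmdeploy import pipeline
from lmdeploy.messages import TurbomindEngineConfig

# tp=8 shards the model over 8 GPUs; cp=2 splits attention over the kv_cache
# so kv-heads are not duplicated when num_key_value_heads < tp.
backend_config = TurbomindEngineConfig(tp=8, cp=2)

pipe = pipeline('internlm/Intern-S1', backend_config=backend_config)
print(pipe(['Hello, introduce yourself.']))
```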

lmdeploy/cli/utils.py

Lines changed: 10 additions & 0 deletions
@@ -188,6 +188,16 @@ def ep(parser):
             default=1,
             help='expert parallelism. dp is required when pytorch engine is used.')

+    @staticmethod
+    def cp(parser):
+        """Add argument cp to parser."""
+
+        return parser.add_argument(
+            '--cp',
+            type=int,
+            default=1,
+            help='context parallelism size in attention for turbomind backend, tp % cp should be 0.')
+
     @staticmethod
     def dp_rank(parser):
         """Add argument dp_rank to parser."""
