diff --git a/.github/workflows/check_failed_tests.yml b/.github/workflows/check_failed_tests.yml index ec40bee377f4..1238d9c569f9 100644 --- a/.github/workflows/check_failed_tests.yml +++ b/.github/workflows/check_failed_tests.yml @@ -115,7 +115,7 @@ jobs: if: ${{ env.process == 'true' && inputs.pr_number != '' }} uses: actions/github-script@v6 with: - script: | + script: | const { data: pr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml index 99785c16fc71..b48d700d3b41 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -96,9 +96,9 @@ jobs: run: | python -m pip install GitPython python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt - echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + echo 'models=$(tail -n 1 output.txt)' >> $GITHUB_ENV python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt - echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV + echo 'quantizations=$(tail -n 1 output2.txt)' >> $GITHUB_ENV - name: Show models to test id: models_to_run @@ -135,6 +135,27 @@ jobs: "repos/${github_repository}/issues/${pr_number}/comments" \ -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!" + # Report back if we are not able to get the tests (for example, security check is failing) + report_error_earlier: + name: Report error earlier + if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }} + needs: [get-pr-number, get-pr-info, get-tests] + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - name: Reply to the comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!" 
+ reply_to_comment: name: Reply to the comment if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} @@ -251,21 +272,21 @@ jobs: python3 << 'PYTHON_SCRIPT' import json import os - + def filter_and_format_report(data): """ Filter out entries where commit is `None` (failing tests who status is not certain) and format as text """ lines = [] - + for model, model_result in data.items(): model_lines = [] for device, failures in model_result.items(): - + # Filter out None commits and extract just the test names test_names = [ - failure['test'] - for failure in failures + failure['test'] + for failure in failures if isinstance(failure, dict) and failure.get('commit') is not None ] @@ -274,32 +295,32 @@ jobs: if idx == 0: job_link = failures[idx]['job_link'] model_lines.append(f"- [{model}]({job_link}):") - + model_lines.append(f" {test_name}") # Only add model section if it has tests if len(model_lines) > 0: lines.extend(model_lines) lines.append("") # Empty line between models - + return "\n".join(lines).strip() - + # Load and filter reports model_report_str = os.environ.get('MODEL_REPORT', '{}') quant_report_str = os.environ.get('QUANT_REPORT', '{}') - + model_report = json.loads(model_report_str) if model_report_str else {} quant_report = json.loads(quant_report_str) if quant_report_str else {} - + formatted_model = filter_and_format_report(model_report) formatted_quant = filter_and_format_report(quant_report) - + # Write to files with open('model_ci.txt', 'w') as f: f.write(formatted_model) if formatted_model: f.write('\n') - + with open('quantization_ci.txt', 'w') as f: f.write(formatted_quant) if formatted_quant: @@ -339,7 +360,7 @@ jobs: cat model_ci.txt echo '' fi - + # Check if quantization_ci.txt has content if [ -s quantization_ci.txt ]; then echo '### Quantization CI Report' diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 219e570469f1..57693322a771 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -545,11 +545,11 @@ jobs: - name: Reinstall transformers in edit mode working-directory: /transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing] - + - name: Install kernels working-directory: /transformers run: python3 -m pip install -U kernels - + - name: NVIDIA-SMI run: nvidia-smi @@ -579,7 +579,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV - + - name: Run kernel tests on GPU working-directory: /transformers run: | @@ -597,6 +597,70 @@ jobs: name: ${{ env.machine_type }}_run_kernels_gpu_test_reports path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports + run_kernels_gpu: + if: ${{ inputs.job == 'run_kernels_gpu' }} + name: Kernel tests + strategy: + fail-fast: false + matrix: + machine_type: [aws-g5-4xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} + + - name: Reinstall transformers in edit mode + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing] + + - name: Install kernels + working-directory: /transformers + run: python3 -m pip install -U kernels + + - name: NVIDIA-SMI + run: nvidia-smi + + - 
name: Environment + working-directory: /transformers + run: python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then + machine_type=single-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run kernel tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_kernels_gpu_test_reports tests/kernels/test_kernels.py + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_kernels_gpu_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports + run_extract_warnings: # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic. if: ${{ always() && inputs.job == 'run_models_gpu' }} diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index ac927b8d2306..cc4b5e2a02fc 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class AlignProcessorKwargs(ProcessingKwargs, total=False): @@ -29,36 +30,8 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AlignProcessor(ProcessorMixin): - r""" - Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and - [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more - information. - The preferred way of passing kwargs is as a dictionary per modality, see usage example below. - ```python - from transformers import AlignProcessor - from PIL import Image - model_id = "kakaobrain/align-base" - processor = AlignProcessor.from_pretrained(model_id) - - processor( - images=your_pil_image, - text=["What is that?"], - images_kwargs = {"crop_size": {"height": 224, "width": 224}}, - text_kwargs = {"padding": "do_not_pad"}, - common_kwargs = {"return_tensors": "pt"}, - ) - ``` - - Args: - image_processor ([`EfficientNetImageProcessor`]): - The image processor is a required input. - tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): - The tokenizer is a required input. 
- - """ - valid_processor_kwargs = AlignProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 933a5e48dfed..989dc7ecdbdd 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -17,24 +17,12 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg +@auto_docstring class AltCLIPProcessor(ProcessorMixin): - r""" - Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single - processor. - - [`AltCLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`XLMRobertaTokenizerFast`]. See - the [`~AltCLIPProcessor.__call__`] and [`~AltCLIPProcessor.decode`] for more information. - - Args: - image_processor ([`CLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`XLMRobertaTokenizerFast`], *optional*): - The tokenizer is a required input. - """ - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index c29c289649da..e8754c1a3df9 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -27,6 +27,7 @@ from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType +from ...utils.auto_docstring import auto_docstring from ..auto import AutoTokenizer @@ -52,21 +53,8 @@ class AriaProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AriaProcessor(ProcessorMixin): - """ - AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. - - Args: - image_processor (`AriaImageProcessor`, *optional*): - The AriaImageProcessor to use for image preprocessing. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. - size_conversion (`Dict`, *optional*): - A dictionary indicating size conversions for images. - """ - def __init__( self, image_processor=None, @@ -74,6 +62,10 @@ def __init__( chat_template: Optional[str] = None, size_conversion: Optional[dict[Union[float, int], int]] = None, ): + """ + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
+ """ if size_conversion is None: size_conversion = {490: 128, 980: 256} self.size_conversion = {int(k): v for k, v in size_conversion.items()} @@ -85,6 +77,7 @@ def __init__( super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], @@ -92,18 +85,6 @@ def __call__( **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). - - Args: - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 049b0e5d24eb..08d42fff08a5 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput, make_flat_list_of_images from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): @@ -36,16 +37,26 @@ class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AyaVisionProcessor(ProcessorMixin): - r""" - Constructs a AyaVision processor which wraps a [`AutoImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~AyaVisionProcessor.__call__`] and [`~AyaVisionProcessor.decode`] for more information. - Args: - image_processor ([`AutoImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 28, + img_size: int = 364, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + downsample_factor: int = 1, + start_of_img_token="<|START_OF_IMG|>", + end_of_img_token="<|END_OF_IMG|>", + img_patch_token="<|IMG_PATCH|>", + img_line_break_token="<|IMG_LINE_BREAK|>", + tile_token="TILE", + tile_global_token="TILE_GLOBAL", + chat_template=None, + **kwargs, + ): + """ patch_size (`int`, *optional*, defaults to 28): The size of image patches for tokenization. img_size (`int`, *optional*, defaults to 364): @@ -66,27 +77,7 @@ class AyaVisionProcessor(ProcessorMixin): The token to be used to represent an image patch in the text. tile_global_token (`str`, *optional*, defaults to `"TILE_GLOBAL"`): The token to be used to represent the cover image in the text. 
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - - def __init__( - self, - image_processor=None, - tokenizer=None, - patch_size: int = 28, - img_size: int = 364, - image_token="", # set the default and let users change if they have peculiar special tokens in rare cases - downsample_factor: int = 1, - start_of_img_token="<|START_OF_IMG|>", - end_of_img_token="<|END_OF_IMG|>", - img_patch_token="<|IMG_PATCH|>", - img_line_break_token="<|IMG_LINE_BREAK|>", - tile_token="TILE", - tile_global_token="TILE_GLOBAL", - chat_template=None, - **kwargs, - ): + """ super().__init__(image_processor, tokenizer, chat_template=chat_template) self.image_token = image_token @@ -125,6 +116,7 @@ def _prompt_split_image(self, num_patches): img_string += f"{self.end_of_img_token}" return img_string + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -132,24 +124,6 @@ def __call__( **kwargs: Unpack[AyaVisionProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. - To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 403d107f48f9..9c702013c740 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -26,6 +26,7 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...utils.hub import cached_file from ..auto import AutoTokenizer @@ -33,13 +34,16 @@ logger = logging.get_logger(__name__) +@auto_docstring class BarkProcessor(ProcessorMixin): - r""" - Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor. + preset_shape = { + "semantic_prompt": 1, # 1D array of shape (X,) + "coarse_prompt": 2, # 2D array of shape (2,X) + "fine_prompt": 2, # 2D array of shape (8,X) + } - Args: - tokenizer ([`PreTrainedTokenizer`]): - An instance of [`PreTrainedTokenizer`]. 
+ def __init__(self, tokenizer, speaker_embeddings=None): + """ speaker_embeddings (`dict[dict[str]]`, *optional*): Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"` @@ -47,15 +51,7 @@ class BarkProcessor(ProcessorMixin): [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for a list of `voice_preset_names`. - """ - - preset_shape = { - "semantic_prompt": 1, # 1D array of shape (X,) - "coarse_prompt": 2, # 2D array of shape (2,X) - "fine_prompt": 2, # 2D array of shape (8,X) - } - - def __init__(self, tokenizer, speaker_embeddings=None): + """ super().__init__(tokenizer) self.speaker_embeddings = speaker_embeddings @@ -260,6 +256,7 @@ def _verify_speaker_embeddings(self, remove_unavailable: bool = True): for voice_preset in unavailable_keys: del self.speaker_embeddings[voice_preset] + @auto_docstring def __call__( self, text=None, @@ -272,26 +269,11 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs` - arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a - voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded - to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename. - - Args: - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - voice_preset (`str`, `dict[np.ndarray]`): - The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g - `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or - it can be a valid file name of a local `.npz` single voice preset containing the keys - `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + voice_preset (`str`, `dict[np.ndarray]`): + The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g + `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or + it can be a valid file name of a local `.npz` single voice preset containing the keys + `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`. Returns: [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`. 
diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 965164206c5a..50c7ae96a130 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -17,6 +17,7 @@ """ from typing import Optional, Union +from ...utils.auto_docstring import auto_docstring from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack @@ -39,48 +40,20 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BlipProcessor(ProcessorMixin): - r""" - Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor. - - [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the - docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. - - Args: - image_processor (`BlipImageProcessor`): - An instance of [`BlipImageProcessor`]. The image processor is a required input. - tokenizer (`BertTokenizerFast`): - An instance of ['BertTokenizerFast`]. The tokenizer is a required input. - """ def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[BlipProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Please refer to the docstring of the above two methods for more information. - Args: - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - """ if images is None and text is None: raise ValueError("You have to specify either images or text.") diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 5949e2c648ce..36ba8bcf4557 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -23,6 +23,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring logger = logging.get_logger(__name__) @@ -44,23 +45,13 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Blip2Processor(ProcessorMixin): - r""" - Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. 
- - [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring - of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. - - Args: - image_processor (`BlipImageProcessor`): - An instance of [`BlipImageProcessor`]. The image processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): + """ num_query_tokens (`int`, *optional*): Number of tokens used by the Qformer as queries, should be same as in model's config. - """ - - def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): + """ tokenizer.return_token_type_ids = False if not hasattr(tokenizer, "image_token"): self.image_token = AddedToken("", normalized=False, special=True) @@ -71,30 +62,13 @@ def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[Blip2ProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Please refer to the docstring of the above two methods for more information. - Args: - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - """ if images is None and text is None: raise ValueError("You have to specify either images or text.") output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 5de97ec411dc..5ccd7aab5658 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): @@ -38,22 +39,8 @@ class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BridgeTowerProcessor(ProcessorMixin): - r""" - Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single - processor. - - [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and - [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and - [`~BridgeTowerProcessor.decode`] for more information. 
- - Args: - image_processor (`BridgeTowerImageProcessor`): - An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. - tokenizer (`RobertaTokenizerFast`): - An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. - """ - valid_processor_kwargs = BridgeTowerProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index d92b163955a7..a2317c43205e 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class BrosProcessorKwargs(ProcessingKwargs, total=False): @@ -34,17 +35,8 @@ class BrosProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BrosProcessor(ProcessorMixin): - r""" - Constructs a Bros processor which wraps a BERT tokenizer. - - [`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of - [`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information. - - Args: - tokenizer (`BertTokenizerFast`, *optional*): - An instance of ['BertTokenizerFast`]. The tokenizer is a required input. - """ valid_processor_kwargs = BrosProcessorKwargs diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 694be7ab8f26..418ef1d30147 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -30,6 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class ChameleonTextKwargs(TextKwargs, total=False): @@ -50,26 +51,15 @@ class ChameleonProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class ChameleonProcessor(ProcessorMixin): - r""" - Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single - processor. - - [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. - See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. - - Args: - image_processor ([`ChameleonImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. + def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + """ image_seq_length (`int`, *optional*, defaults to 1024): Sequence length of one image embedding. image_token (`str`, *optional*, defaults to `""`): The special token used to indicate image in the text. 
- """ - - def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + """ self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) @@ -84,6 +74,7 @@ def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, ima super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -91,26 +82,6 @@ def __call__( **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 6508136f772e..e60944c330e7 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class ChineseCLIPProcessor(ProcessorMixin): - r""" - Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a - single processor. - - [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`]. - See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information. - - Args: - image_processor ([`ChineseCLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`BertTokenizerFast`], *optional*): - The tokenizer is a required input. 
- """ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index a72151cb9b63..4a733f33cc4a 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -22,30 +22,20 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) +@auto_docstring class ClapProcessor(ProcessorMixin): - r""" - Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBerta tokenizer into a single processor. - - [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the - [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. - - Args: - feature_extractor ([`ClapFeatureExtractor`]): - The audio processor is a required input. - tokenizer ([`RobertaTokenizerFast`]): - The tokenizer is a required input. - """ - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") + @auto_docstring def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, @@ -53,11 +43,6 @@ def __call__( audio: Optional[AudioInput] = None, **kwargs: Unpack[ProcessingKwargs], ): - """ - Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text` - argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more - information. - """ # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check # again that the correct naming is used if audios is not None and audio is None: diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 9258d2e8fee3..3b20fff7490a 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils import auto_docstring +@auto_docstring class CLIPProcessor(ProcessorMixin): - r""" - Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor. - - [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the - [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. - - Args: - image_processor ([`CLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`AutoTokenizer`], *optional*): - The tokenizer is a required input. 
- """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 4d431181cb4f..f856b97a4490 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -18,51 +18,21 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding +from ...utils.auto_docstring import auto_docstring +@auto_docstring class CLIPSegProcessor(ProcessorMixin): - r""" - Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor. - - [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the - [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. - - Args: - image_processor ([`ViTImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`CLIPTokenizerFast`], *optional*): - The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs): """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of - the above two methods for more information. - - Args: - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, - NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape - (C, H, W), where C is a number of channels, H and W are image height and width. - - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, + NumPy array or PyTorch tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape + (C, H, W), where C is a number of channels, H and W are image height and width. Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 331589a23999..41812fe21195 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -19,34 +19,20 @@ from ...processing_utils import ProcessorMixin from ...utils import logging +from ...utils.auto_docstring import auto_docstring logger = logging.get_logger(__name__) +@auto_docstring class ClvpProcessor(ProcessorMixin): - r""" - Constructs a CLVP processor which wraps a CLVP Feature Extractor and a CLVP Tokenizer into a single processor. - - [`ClvpProcessor`] offers all the functionalities of [`ClvpFeatureExtractor`] and [`ClvpTokenizer`]. See the - [`~ClvpProcessor.__call__`], [`~ClvpProcessor.decode`] and [`~ClvpProcessor.batch_decode`] for more information. - - Args: - feature_extractor (`ClvpFeatureExtractor`): - An instance of [`ClvpFeatureExtractor`]. The feature extractor is a required input. - tokenizer (`ClvpTokenizer`): - An instance of [`ClvpTokenizer`]. The tokenizer is a required input. - """ def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) + @auto_docstring def __call__(self, *args, **kwargs): - """ - Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text` - argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more - information. - """ raw_speech = kwargs.pop("raw_speech", None) if raw_speech is not None: logger.warning( diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index b34fd1c5594e..690a1906e7cf 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): @@ -33,20 +34,8 @@ class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Cohere2VisionProcessor(ProcessorMixin): - r""" - Constructs a Cohere2Vision processor which wraps a [`AutoImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~Cohere2VisionProcessor.__call__`] and [`~Cohere2VisionProcessor.decode`] for more information. - Args: - image_processor ([`AutoImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. 
- """ - def __init__( self, image_processor=None, @@ -72,6 +61,7 @@ def __init__( ] ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -79,24 +69,6 @@ def __call__( **kwargs: Unpack[Cohere2VisionProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. - To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 1ad511ced7a7..463be38d818d 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -72,27 +73,8 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" +@auto_docstring class ColPaliProcessor(ProcessorMixin): - r""" - Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as - well as to compute the late-interaction retrieval score. - - [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`] - for more information. - - Args: - image_processor ([`SiglipImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): - A string that gets tokenized and prepended to the image tokens. - query_prefix (`str`, *optional*, defaults to `"Question: "`): - A prefix to be used for the query. 
- """ - def __init__( self, image_processor=None, @@ -101,6 +83,12 @@ def __init__( visual_prompt_prefix: str = "Describe the image.", query_prefix: str = "Question: ", ): + """ + visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Question: "`): + A prefix to be used for the query. + """ self.visual_prompt_prefix = visual_prompt_prefix self.query_prefix = query_prefix if not hasattr(image_processor, "image_seq_length"): @@ -124,6 +112,7 @@ def __init__( super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -131,31 +120,6 @@ def __call__( **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom - wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process - both text and images at the same time. - - When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's - [`~LlamaTokenizerFast.__call__`]. - When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's - [`~SiglipImageProcessor.__call__`]. - Please refer to the docstring of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 00f00c920856..a8d1db4fff4e 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -26,6 +26,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -45,25 +46,8 @@ class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class ColQwen2Processor(ProcessorMixin): - r""" - Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as - well as to compute the late-interaction retrieval score. 
- - [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`] - for more information. - - Args: - image_processor ([`Qwen2VLImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`Qwen2TokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens. - query_prefix (`str`, *optional*): A prefix to be used for the query. - """ - def __init__( self, image_processor=None, @@ -85,6 +69,7 @@ def __init__( query_prefix = "Query: " self.query_prefix = query_prefix + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -92,31 +77,10 @@ def __call__( **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom - wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process - both text and images at the same time. - - When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's - [`~Qwen2TokenizerFast.__call__`]. - When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's - [`~Qwen2VLImageProcessor.__call__`]. - Please refer to the doctsring of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - + visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Query: "`): + A prefix to be used for the query. 
Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index d77ffeffd896..ff59ee3f912b 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -20,6 +20,7 @@ import numpy as np from ...utils import is_soundfile_available, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -59,42 +60,8 @@ class CsmProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class CsmProcessor(ProcessorMixin): - r""" - Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and - [`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and - tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more - information. - The preferred way of passing kwargs is as a dictionary per modality, see usage example below. - ```python - from transformers import CsmProcessor - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train") - audio = ds[0]["audio"]["array"] - - processor = CsmProcessor.from_pretrained("sesame/csm-1b") - - processor( - text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"], - audio=audio, - text_kwargs = {"padding": False}, - audio_kwargs = {"sampling_rate": 16000}, - common_kwargs = {"return_tensors": "pt"}, - ) - # this should error out because EncodecFeatureExtractor expects a 24kHz audio :) - ``` - - Args: - feature_extractor ([`EncodecFeatureExtractor`]): - The feature extractor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - - """ - def __init__( self, feature_extractor, @@ -189,6 +156,7 @@ def save_audio( audio_value = audio_value.cpu().float().numpy() sf.write(p, audio_value, sampling_rate) + @auto_docstring def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]], @@ -197,21 +165,7 @@ def __call__( depth_decoder_labels_ratio: Optional[float] = 1.0, **kwargs: Unpack[CsmProcessorKwargs], ): - r""" - Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text` - arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode - the text. To prepare the audio, this method forwards the `audio` arguments to - EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer - to the docstring of the above two methods for more information. - - Args: - audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`): - The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch - tensor. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + """ output_labels (bool, *optional*, default=False): Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`. 
- `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames) @@ -219,10 +173,7 @@ def __call__( - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels) depth_decoder_labels_ratio (float, *optional*, default=1.0): The ratio of audio frames to keep for the depth decoder labels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index 22b1c2ab71dd..2e1ff47ad437 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -24,6 +24,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): @@ -33,25 +34,8 @@ class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DeepseekVLProcessor(ProcessorMixin): - r""" - Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. - - [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. - - Args: - image_processor ([`DeepseekVLImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - num_image_tokens (`int`, *optional*, defaults to 576): - The number of special image tokens used as placeholders for visual content in text sequences. - """ - def __init__( self, image_processor, @@ -59,11 +43,16 @@ def __init__( chat_template=None, num_image_tokens=576, ): + """ + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ self.image_token = tokenizer.image_token self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, @@ -71,25 +60,6 @@ def __call__( **kwargs: Unpack[DeepseekVLProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring - of the above two methods for more information. 
- - Args: - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index 8f842db7346f..db19160f0a71 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -24,6 +24,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): @@ -33,25 +34,8 @@ class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DeepseekVLHybridProcessor(ProcessorMixin): - r""" - Constructs a DeepseekVLHybrid processor which wraps a DeepseekVLHybrid Image Processor and a Llama tokenizer into a single processor. - - [`DeepseekVLHybridProcessor`] offers all the functionalities of [`DeepseekVLHybridImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~DeepseekVLHybridProcessor.__call__`] and [`~DeepseekVLHybridProcessor.decode`] for more information. - - Args: - image_processor ([`DeepseekVLHybridImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - num_image_tokens (`int`, *optional*, defaults to 576): - The number of special image tokens used as placeholders for visual content in text sequences. - """ - def __init__( self, image_processor, @@ -59,11 +43,16 @@ def __init__( chat_template=None, num_image_tokens=576, ): + """ + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ self.image_token = tokenizer.image_token self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, @@ -71,32 +60,13 @@ def __call__( **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring - of the above two methods for more information. - - Args: - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 23c04687308c..0d1281f0744b 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -22,6 +22,7 @@ from ...feature_extraction_utils import BatchFeature from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...utils import is_soundfile_available, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -61,27 +62,18 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DiaProcessor(ProcessorMixin): - r""" - Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into - a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio- - nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more - information. - - Args: - feature_extractor (`DiaFeatureExtractor`): - An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input. - tokenizer (`DiaTokenizer`): - An instance of [`DiaTokenizer`]. The tokenizer is a required input. - audio_tokenizer (`DacModel`): - An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. 
- """ - audio_tokenizer_class = "DacModel" def __init__(self, feature_extractor, tokenizer, audio_tokenizer): + """ + audio_tokenizer (`DacModel`): + An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. + """ super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer) + @auto_docstring def __call__( self, text: Union[str, list[str]], @@ -89,12 +81,6 @@ def __call__( output_labels: Optional[bool] = False, **kwargs: Unpack[DiaProcessorKwargs], ): - """ - Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is - forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the - DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer - to the docstring of the above methods for more information. - """ if not is_torch_available(): raise ValueError( "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't " diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index fedd173117eb..c004b9499df7 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -23,6 +23,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring class DonutProcessorKwargs(ProcessingKwargs, total=False): @@ -32,37 +33,18 @@ class DonutProcessorKwargs(ProcessingKwargs, total=False): logger = logging.get_logger(__name__) +@auto_docstring class DonutProcessor(ProcessorMixin): - r""" - Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single - processor. - - [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and - [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and - [`~DonutProcessor.decode`] for more information. - - Args: - image_processor ([`DonutImageProcessor`], *optional*): - An instance of [`DonutImageProcessor`]. The image processor is a required input. - tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*): - An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[DonutProcessorKwargs], ): - """ - When used in normal mode, this method forwards all its arguments to AutoImageProcessor's - [`~AutoImageProcessor.__call__`] and returns its output. If used in the context - [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's - [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. 
- """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index 0c550937581f..735046dd9390 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -48,6 +48,13 @@ class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): + """ + ratio (`str`, *optional*, defaults to `"1:1"`): + The ratio of the image to resize the image. + image_area (`int`, *optional*, defaults to `518400`): + The area of the image to resize the image. + """ + ratio: str image_area: int diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index 52f39a913c54..c7355c67effa 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -22,11 +22,11 @@ from ...image_utils import ImageInput from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import is_vision_available +from ...utils import auto_docstring, is_vision_available if is_vision_available(): - from .image_processing_emu3 import smart_resize + from .image_processing_emu3 import Emu3ImageProcessorKwargs, smart_resize class Emu3TextKwargs(TextKwargs, total=False): @@ -35,6 +35,7 @@ class Emu3TextKwargs(TextKwargs, total=False): class Emu3ProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: Emu3TextKwargs + images_kwargs: Emu3ImageProcessorKwargs _defaults = { "text_kwargs": { "return_for_image_generation": False, @@ -47,23 +48,8 @@ class Emu3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Emu3Processor(ProcessorMixin): - r""" - Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single - processor. - - [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`]. - See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information. - - Args: - image_processor ([`Emu3ImageProcessor`]): - The image processor is a required input. - tokenizer ([`Emu3TokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - def __init__( self, image_processor, @@ -81,6 +67,7 @@ def __init__( self.downsample_ratio = 8 super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -88,26 +75,6 @@ def __call__( **kwargs: Unpack[Emu3ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py index 807bd294c406..afc0812f5f80 100644 --- a/src/transformers/models/evolla/processing_evolla.py +++ b/src/transformers/models/evolla/processing_evolla.py @@ -22,30 +22,23 @@ from ...processing_utils import ( ProcessorMixin, ) +from ...utils.auto_docstring import auto_docstring PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"] +@auto_docstring class EvollaProcessor(ProcessorMixin): - r""" - Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor. - - [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the - docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information. - - Args: + def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs): + """ protein_tokenizer (`EsmTokenizer`): An instance of [`EsmTokenizer`]. The protein tokenizer is a required input. - tokenizer (`LlamaTokenizerFast`, *optional*): - An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. protein_max_length (`int`, *optional*, defaults to 1024): The maximum length of the sequence to be generated. text_max_length (`int`, *optional*, defaults to 512): The maximum length of the text to be generated. - """ - - def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs): + """ if protein_tokenizer is None: raise ValueError("You need to specify an `protein_tokenizer`.") if tokenizer is None: @@ -94,6 +87,7 @@ def process_text( ) return prompt_inputs + @auto_docstring def __call__( self, proteins: Optional[Union[list[dict], dict]] = None, @@ -102,7 +96,8 @@ def __call__( text_max_length: Optional[int] = None, **kwargs, ): - r"""This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by + r""" + This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by the model. Args: diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 7e5b3c0e012e..0bb603753788 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -17,20 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class FlavaProcessor(ProcessorMixin): - r""" - Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor. 
- - [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the - [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. - - Args: - image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index c8d699e4bc3e..d6fca01ef2c1 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -28,6 +28,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -42,26 +43,8 @@ class Florence2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Florence2Processor(ProcessorMixin): - r""" - Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor. - - [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the - [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information. - - Args: - image_processor (`AutoImageProcessor`, *optional*): - The image processor is a required input. - tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*): - The tokenizer is a required input. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. - post_processor_config (`dict`, *optional*, defaults to 0): - Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, - thresholds, or banned tokens. - """ - def __init__( self, image_processor=None, @@ -70,6 +53,14 @@ def __init__( post_processor_config: Optional[dict] = None, **kwargs, ): + """ + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + post_processor_config (`dict`, *optional*, defaults to `None`): + Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, + thresholds, or banned tokens. + """ self.tasks_answer_post_processing_type = { "": "pure_text", "": "ocr", @@ -143,6 +134,7 @@ def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]: prompts.append(prompt) return prompts + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -150,25 +142,6 @@ def __call__( **kwargs: Unpack[Florence2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode - the text.
To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index ee697deccf9e..ceda84912ba5 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -30,6 +30,7 @@ ) from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging, requires_backends +from ...utils.auto_docstring import auto_docstring from ...utils.import_utils import requires @@ -333,20 +334,8 @@ def scale_bbox_to_transformed_image( @requires(backends=("vision",)) +@auto_docstring class FuyuProcessor(ProcessorMixin): - r""" - Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor. - - [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information. - - Args: - image_processor ([`FuyuImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - """ - def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor @@ -478,6 +467,7 @@ def get_sample_encoding( } return batch_encoding + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -485,21 +475,6 @@ def __call__( **kwargs: Unpack[FuyuProcessorKwargs], ) -> "FuyuBatchFeature": """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to - encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `list[PIL.Image.Image]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. 
- text (`str`, `list[str]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - Returns: [`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields: diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index 11574e30b7c1..d8d8e3d61e26 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -23,6 +23,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import to_py_obj +from ...utils.auto_docstring import auto_docstring class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): @@ -41,6 +42,7 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Gemma3Processor(ProcessorMixin): def __init__( self, @@ -64,6 +66,7 @@ def __init__( **kwargs, ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 51b686557ed0..9be0c408c129 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput, make_nested_list_of_images from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): @@ -29,28 +30,8 @@ class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Gemma3nProcessor(ProcessorMixin): - """ - A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer - into a single processor. - - Args: - feature_extractor (`Gemma3nAudioFeatureExtractor`): - Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This - should return a `BatchFeature` with `input_features` and `input_features_mask` features. - image_processor (`SiglipImageProcessorFast`): - Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature` - with a `pixel_values` feature. - tokenizer (`GemmaTokenizerFast`): - The text tokenizer for the model. - chat_template (`string`, *optional*): - A Jinja template for generating text prompts from a set of messages. 
- audio_seq_length (int, *optional*, defaults to 188): - The number of audio soft tokens that will be added to the text prompt - image_seq_length (int, *optional*, defaults to 256): - The number of image soft tokens that should be added to - """ - def __init__( self, feature_extractor, @@ -61,6 +42,12 @@ def __init__( image_seq_length: int = 256, **kwargs, ): + """ + audio_seq_length (int, *optional*, defaults to 188): + The number of audio soft tokens that will be added to the text prompt + image_seq_length (int, *optional*, defaults to 256): + The number of image soft tokens that will be added to the text prompt + """ self.audio_seq_length = audio_seq_length self.audio_token_id = tokenizer.audio_token_id self.boa_token = tokenizer.boa_token @@ -83,6 +70,7 @@ def __init__( **kwargs, ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 89cfc9618987..80e4f31ad3e1 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class GitProcessor(ProcessorMixin): - r""" - Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor. - - [`GitProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BertTokenizerFast`]. See the - [`~GitProcessor.__call__`] and [`~GitProcessor.decode`] for more information. - - Args: - image_processor ([`AutoImageProcessor`]): - The image processor is a required input. - tokenizer ([`AutoTokenizer`]): - The tokenizer is a required input. - """ - def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 79935cbde7b4..8dd68393a76b 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...video_utils import VideoInput @@ -44,21 +45,8 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Glm4vProcessor(ProcessorMixin): - r""" - Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor. - [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information. - Args: - image_processor ([`Glm4vProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. - video_processor ([`Glm4vVideoProcessor`], *optional*): - The video processor is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string.
- """ - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token @@ -74,6 +62,7 @@ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, c ) super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -82,26 +71,6 @@ def __call__( **kwargs: Unpack[Glm4vProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode - the text. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch - tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 162efef5e9f9..9b5a773ef4ec 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -23,6 +23,7 @@ from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available, logging +from ...utils.auto_docstring import auto_docstring if is_vision_available(): @@ -79,20 +80,8 @@ def preprocess_box_annotation(box: Union[list, tuple], image_size: tuple[int, in return list(box) +@auto_docstring class GotOcr2Processor(ProcessorMixin): - r""" - Constructs a GotOcr2 processor which wraps a [`GotOcr2ImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~GotOcr2Processor.__call__`] and [`~GotOcr2Processor.decode`] for more information. - Args: - image_processor ([`GotOcr2ImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. 
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -127,6 +116,7 @@ def _make_list_of_inputs(self, images, text, box, color, multi_page): return images, text, box, color + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -134,45 +124,6 @@ def __call__( **kwargs: Unpack[GotOcr2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text` - is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and - `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - format (`bool`, *optional*): - If set, will add the format token to the query, and the model will return the OCR result with formatting. - box (`list[float]`, `list[tuple[float, float]]`, `list[tuple[float, float, float, float]]`, *optional*): - The box annotation to be added to the query. If a list of floats or a tuple of floats is provided, it - will be interpreted as [x1, y1, x2, y2]. If a list of tuples is provided, each tuple should be in the - form (x1, y1, x2, y2). - color (`str`, *optional*): - The color annotation to be added to the query. The model will return the OCR result within the box with - the specified color. - multi_page (`bool`, *optional*): - If set, will enable multi-page inference. The model will return the OCR result across multiple pages. - crop_to_patches (`bool`, *optional*): - If set, will crop the image to patches. The model will return the OCR result upon the patch reference. - min_patches (`int`, *optional*): - The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to - `True`. - max_patches (`int`, *optional*): - The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to - `True`. - - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. 
- Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 910840bd661c..51e7ab8c9031 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -20,6 +20,7 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging +from ...utils.auto_docstring import auto_docstring from ...utils.import_utils import requires_backends @@ -29,6 +30,7 @@ logger = logging.get_logger(__name__) +@auto_docstring class GraniteSpeechProcessor(ProcessorMixin): def __init__( self, @@ -37,9 +39,14 @@ def __init__( audio_token="<|audio|>", chat_template=None, ): + """ + audio_token (str, *optional*, defaults to "<|audio|>"): + The audio token to use for the processor. + """ self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token super().__init__(audio_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 74565588d852..60258b209d19 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -24,6 +24,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -114,47 +115,20 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class GroundingDinoProcessor(ProcessorMixin): - r""" - Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a - single processor. - - [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and - [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] - for more information. - - Args: - image_processor (`GroundingDinoImageProcessor`): - An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. - """ - valid_processor_kwargs = GroundingDinoProcessorKwargs def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Args: - images (`ImageInput`, `list[ImageInput]`, *optional*): - The image or batch of images to be processed. The image might be either PIL image, numpy array or a torch tensor.
- text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*): - Candidate labels to be detected on the image. The text might be one of the following: - - A list of candidate labels (strings) to be detected on the image (e.g. ["a cat", "a dog"]). - - A batch of candidate labels to be detected on the batch of images (e.g. [["a cat", "a dog"], ["a car", "a person"]]). - - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog."). - - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]). - """ if text is not None: text = self._preprocess_input_text(text) return super().__call__(images=images, text=text, **kwargs) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 7cb640e56854..00d368c1fc2e 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -29,6 +29,7 @@ ) from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg @@ -135,25 +136,15 @@ def is_url(string): return all([result.scheme, result.netloc]) +@auto_docstring class IdeficsProcessor(ProcessorMixin): - r""" - Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor. - - [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`IdeficsImageProcessor`): - An instance of [`IdeficsImageProcessor`]. The image processor is a required input. - tokenizer (`LlamaTokenizerFast`): - An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. - image_size (`int`, *optional*, defaults to 224): - Image size (assuming a square image) - add_end_of_utterance_token (`str`, *optional*): - The string representation of token representing end of utterance - """ - def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): + """ + image_size (int, *optional*, defaults to 224): + The size of the image to be processed. + add_end_of_utterance_token (str, *optional*): + The string representation of the token representing end of utterance. + """ super().__init__(image_processor, tokenizer) self.image_token_id = ( tokenizer.image_token_id @@ -172,6 +163,7 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u ) @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]] = None, @@ -185,29 +177,16 @@ def __call__( ] = None, **kwargs: Unpack[IdeficsProcessorKwargs], ) -> BatchFeature: - """This method takes batched or non-batched prompts made of text and images and converts them into prompts that - the model was trained on and prepares the image pixel values for the model to process. - - Args: - images (`Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]]`): - either a single image or a batched list of images - can be passed in when text contains only text prompts, - in order to use the image-text-to-text behavior.
- text (`Union[list[TextInput], [list[list[TextInput]]]]`): - either a single prompt or a batched list of prompts - see the detailed description immediately after - the end of the arguments doc section. - return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`): - The type of tensors to return. Can be one of: - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - + """ Returns: a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be directly passed to `model.generate` - Detailed explanation: + Detailed explanation: - Each entry in `text` is either a text to be passed as is or an image that will be processed. + Each entry in `text` is either a text to be passed as is or an image that will be processed. - An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. + An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. When the processor encounters an image it'll inject `` entry into the prompt. diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index df5f9ca73a8b..05f7153fdfea 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -28,6 +28,7 @@ ) from ...tokenization_utils_base import AddedToken, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring if TYPE_CHECKING: @@ -55,29 +56,17 @@ class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Idefics2Processor(ProcessorMixin): - r""" - Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. - - [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`Idefics2ImageProcessor`): - An instance of [`Idefics2ImageProcessor`]. The image processor is a required input. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs + ): + """ image_seq_len (`int`, *optional*, defaults to 64): The length of the image sequence i.e. the number of tokens per image in the input. This parameter is used to build the string from the input prompt and image tokens and should match the config.perceiver_config.resampler_n_latents value for the model used. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. 
- """ - - def __init__( - self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs - ): + """ if not hasattr(tokenizer, "image_token"): self.fake_image_token = AddedToken("", normalized=False, special=True).content self.image_token = AddedToken("", normalized=False, special=True).content @@ -107,58 +96,13 @@ def _extract_images_from_prompts(self, prompts): prompt_images.append(images) return prompt_images + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, **kwargs: Unpack[Idefics2ProcessorKwargs], ) -> BatchFeature: - """ - Processes the input prompts and returns a BatchEncoding. - - Example: - - ```python - >>> import requests - >>> from transformers import Idefics2Processor - >>> from transformers.image_utils import load_image - - >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) - >>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example - - >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" - >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg" - - >>> image1, image2 = load_image(url1), load_image(url2) - >>> images = [[image1], [image2]] - - >>> text = [ - ... "In this image, we see", - ... "bla bla bla", - ... ] - >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) - >>> input_ids = outputs.input_ids - >>> input_tokens = processor.tokenizer.batch_decode(input_ids) - >>> print(input_tokens) - [' In this image, we see', ' bla bla bla'] - ``` - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - - Wherever an image token, `` is encountered it is expanded to - `` + `` * `image_seq_len` * `. - return_tensors (`Union[str, TensorType]`, *optional*): - If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more - information. 
- - """ if text is None and images is None: raise ValueError("You must provide either `text` or `images`.") diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 5c978eb3b230..73584110b55e 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring if TYPE_CHECKING: @@ -101,29 +102,17 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Idefics3Processor(ProcessorMixin): - r""" - Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. - - [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and [`Idefics3TokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`Idefics3ImageProcessor`): - An instance of [`Idefics3ImageProcessor`]. The image processor is a required input. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs + ): + """ image_seq_len (`int`, *optional*, defaults to 169): The length of the image sequence i.e. the number of tokens per image in the input. This parameter is used to build the string from the input prompt and image tokens and should match the value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2)) - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - - def __init__( - self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs - ): + """ self.fake_image_token = AddedToken("", normalized=False, special=True).content self.image_token = AddedToken("", normalized=False, special=True).content self.end_of_utterance_token = AddedToken("", normalized=False, special=True).content @@ -164,6 +153,7 @@ def _extract_images_from_prompts(self, prompts): prompt_images.append(images) return prompt_images + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, @@ -172,51 +162,9 @@ def __call__( **kwargs: Unpack[Idefics3ProcessorKwargs], ) -> BatchEncoding: """ - Processes the input prompts and returns a BatchEncoding. 
- - Example: - - ```python - >>> import requests - >>> from transformers import Idefics3Processor - >>> from transformers.image_utils import load_image - - >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") - >>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example - - >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" - >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg" - - >>> image1, image2 = load_image(url1), load_image(url2) - >>> images = [[image1], [image2]] - - >>> text = [ - ... "In this image, we see", - ... "bla bla bla", - ... ] - >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) - >>> input_ids = outputs.input_ids - >>> input_tokens = processor.tokenizer.batch_decode(input_ids) - >>> print(input_tokens) - ['<|begin_of_text|>(()*169) In this image, we see', '<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|begin_of_text|>bla bla bla(()*169)'] - ``` - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - Wherever an image token, `` is encountered it is expanded to - `` + `` + `` * `image_seq_len` * `. - image_seq_len (`int`, *optional*): - The length of the image sequence. If not provided, the default value of self.image_seq_len is used. - image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2)) - return_tensors (`Union[str, TensorType]`, *optional*): - If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more - information. + image_seq_len (`int`, *optional*): + The length of the image sequence. If not provided, the default value of self.image_seq_len is used. 
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index cfed52f745ae..017e0e2564d2 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -23,6 +23,7 @@
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
 from ...utils import logging
+from ...utils.auto_docstring import auto_docstring

 logger = logging.get_logger(__name__)
@@ -44,26 +45,15 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
    }

+@auto_docstring
 class InstructBlipProcessor(ProcessorMixin):
-    r"""
-    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
-    processor.
-
-    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
-    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
-
-    Args:
-        image_processor (`BlipImageProcessor`):
-            An instance of [`BlipImageProcessor`]. The image processor is a required input.
-        tokenizer (`AutoTokenizer`):
-            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        """
        qformer_tokenizer (`AutoTokenizer`):
            An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
-    """
-
-    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        """
        if not hasattr(tokenizer, "image_token"):
            self.image_token = AddedToken("<image>", normalized=False, special=True)
            tokenizer.add_tokens([self.image_token], special_tokens=True)
@@ -73,26 +63,13 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_toke
        super().__init__(image_processor, tokenizer, qformer_tokenizer)

+    @auto_docstring
    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        **kwargs: Unpack[InstructBlipProcessorKwargs],
    ) -> BatchFeature:
-        """
-        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
-        [`BertTokenizerFast.__call__`] to prepare text for the model.
-
-        Please refer to the docstring of the above two methods for more information.
-        Args:
-            images (`ImageInput`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-        """
        if images is None and text is None:
            raise ValueError("You have to specify at least images or text.")
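With the hand-written `__call__` docstring removed in favor of `@auto_docstring`, a short usage sketch may help readers of this diff. This is an illustration only, assuming the public `Salesforce/instructblip-vicuna-7b` checkpoint and a placeholder image; it is not part of the change itself:

```python
# Minimal usage sketch for InstructBlipProcessor (illustration, not from this diff).
from PIL import Image
from transformers import InstructBlipProcessor

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

image = Image.new("RGB", (224, 224))  # stand-in for a real image
inputs = processor(images=image, text="What is shown in the picture?", return_tensors="pt")

# Both tokenizers run: the main tokenizer produces `input_ids`/`attention_mask`,
# the Q-Former tokenizer produces `qformer_input_ids`/`qformer_attention_mask`,
# and the image processor contributes `pixel_values`.
print(sorted(inputs.keys()))
```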
- """ if images is None and text is None: raise ValueError("You have to specify at least images or text.") diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 81d0103b2742..cc776e5a1e70 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -28,32 +28,22 @@ TruncationStrategy, ) from ...utils import TensorType, logging +from ...utils.auto_docstring import auto_docstring from ...video_utils import VideoInput logger = logging.get_logger(__name__) +@auto_docstring class InstructBlipVideoProcessor(ProcessorMixin): - r""" - Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single - processor. - - [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoVideoProcessor`] and [`AutoTokenizer`]. See the - docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information. - - Args: - video_processor (`InstructBlipVideoVideoProcessor`): - An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): + """ qformer_tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. num_query_tokens (`int`, *optional*): Number of tokens used by the Qformer as queries, should be same as in model's config. - """ - - def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): + """ if not hasattr(tokenizer, "video_token"): self.video_token = AddedToken("