diff --git a/.github/workflows/check_failed_tests.yml b/.github/workflows/check_failed_tests.yml index ec40bee377f4..1238d9c569f9 100644 --- a/.github/workflows/check_failed_tests.yml +++ b/.github/workflows/check_failed_tests.yml @@ -115,7 +115,7 @@ jobs: if: ${{ env.process == 'true' && inputs.pr_number != '' }} uses: actions/github-script@v6 with: - script: | + script: | const { data: pr } = await github.rest.pulls.get({ owner: context.repo.owner, repo: context.repo.repo, diff --git a/.github/workflows/self-comment-ci.yml b/.github/workflows/self-comment-ci.yml index 99785c16fc71..b48d700d3b41 100644 --- a/.github/workflows/self-comment-ci.yml +++ b/.github/workflows/self-comment-ci.yml @@ -96,9 +96,9 @@ jobs: run: | python -m pip install GitPython python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt - echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + echo 'models=$(tail -n 1 output.txt)' >> $GITHUB_ENV python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt - echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV + echo 'quantizations=$(tail -n 1 output2.txt)' >> $GITHUB_ENV - name: Show models to test id: models_to_run @@ -135,6 +135,27 @@ jobs: "repos/${github_repository}/issues/${pr_number}/comments" \ -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!" + # Report back if we are not able to get the tests (for example, security check is failing) + report_error_earlier: + name: Report error earlier + if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }} + needs: [get-pr-number, get-pr-info, get-tests] + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - name: Reply to the comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!" 
+ reply_to_comment: name: Reply to the comment if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} @@ -251,21 +272,21 @@ jobs: python3 << 'PYTHON_SCRIPT' import json import os - + def filter_and_format_report(data): """ Filter out entries where commit is `None` (failing tests who status is not certain) and format as text """ lines = [] - + for model, model_result in data.items(): model_lines = [] for device, failures in model_result.items(): - + # Filter out None commits and extract just the test names test_names = [ - failure['test'] - for failure in failures + failure['test'] + for failure in failures if isinstance(failure, dict) and failure.get('commit') is not None ] @@ -274,32 +295,32 @@ jobs: if idx == 0: job_link = failures[idx]['job_link'] model_lines.append(f"- [{model}]({job_link}):") - + model_lines.append(f" {test_name}") # Only add model section if it has tests if len(model_lines) > 0: lines.extend(model_lines) lines.append("") # Empty line between models - + return "\n".join(lines).strip() - + # Load and filter reports model_report_str = os.environ.get('MODEL_REPORT', '{}') quant_report_str = os.environ.get('QUANT_REPORT', '{}') - + model_report = json.loads(model_report_str) if model_report_str else {} quant_report = json.loads(quant_report_str) if quant_report_str else {} - + formatted_model = filter_and_format_report(model_report) formatted_quant = filter_and_format_report(quant_report) - + # Write to files with open('model_ci.txt', 'w') as f: f.write(formatted_model) if formatted_model: f.write('\n') - + with open('quantization_ci.txt', 'w') as f: f.write(formatted_quant) if formatted_quant: @@ -339,7 +360,7 @@ jobs: cat model_ci.txt echo '' fi - + # Check if quantization_ci.txt has content if [ -s quantization_ci.txt ]; then echo '### Quantization CI Report' diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 219e570469f1..57693322a771 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -545,11 +545,11 @@ jobs: - name: Reinstall transformers in edit mode working-directory: /transformers run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing] - + - name: Install kernels working-directory: /transformers run: python3 -m pip install -U kernels - + - name: NVIDIA-SMI run: nvidia-smi @@ -579,7 +579,7 @@ jobs: echo "$machine_type" echo "machine_type=$machine_type" >> $GITHUB_ENV - + - name: Run kernel tests on GPU working-directory: /transformers run: | @@ -597,6 +597,70 @@ jobs: name: ${{ env.machine_type }}_run_kernels_gpu_test_reports path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports + run_kernels_gpu: + if: ${{ inputs.job == 'run_kernels_gpu' }} + name: Kernel tests + strategy: + fail-fast: false + matrix: + machine_type: [aws-g5-4xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }} + + - name: Reinstall transformers in edit mode + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing] + + - name: Install kernels + working-directory: /transformers + run: python3 -m pip install -U kernels + + - name: NVIDIA-SMI + run: nvidia-smi + + - 
name: Environment + working-directory: /transformers + run: python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then + machine_type=single-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run kernel tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_kernels_gpu_test_reports tests/kernels/test_kernels.py + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_kernels_gpu_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports + run_extract_warnings: # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic. if: ${{ always() && inputs.job == 'run_models_gpu' }} diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index ac927b8d2306..cc4b5e2a02fc 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class AlignProcessorKwargs(ProcessingKwargs, total=False): @@ -29,36 +30,8 @@ class AlignProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AlignProcessor(ProcessorMixin): - r""" - Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and - [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more - information. - The preferred way of passing kwargs is as a dictionary per modality, see usage example below. - ```python - from transformers import AlignProcessor - from PIL import Image - model_id = "kakaobrain/align-base" - processor = AlignProcessor.from_pretrained(model_id) - - processor( - images=your_pil_image, - text=["What is that?"], - images_kwargs = {"crop_size": {"height": 224, "width": 224}}, - text_kwargs = {"padding": "do_not_pad"}, - common_kwargs = {"return_tensors": "pt"}, - ) - ``` - - Args: - image_processor ([`EfficientNetImageProcessor`]): - The image processor is a required input. - tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): - The tokenizer is a required input. 
- - """ - valid_processor_kwargs = AlignProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index 933a5e48dfed..989dc7ecdbdd 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -17,24 +17,12 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg +@auto_docstring class AltCLIPProcessor(ProcessorMixin): - r""" - Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single - processor. - - [`AltCLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`XLMRobertaTokenizerFast`]. See - the [`~AltCLIPProcessor.__call__`] and [`~AltCLIPProcessor.decode`] for more information. - - Args: - image_processor ([`CLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`XLMRobertaTokenizerFast`], *optional*): - The tokenizer is a required input. - """ - @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") def __init__(self, image_processor=None, tokenizer=None): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index c29c289649da..e8754c1a3df9 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -27,6 +27,7 @@ from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import TensorType +from ...utils.auto_docstring import auto_docstring from ..auto import AutoTokenizer @@ -52,21 +53,8 @@ class AriaProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AriaProcessor(ProcessorMixin): - """ - AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. - - Args: - image_processor (`AriaImageProcessor`, *optional*): - The AriaImageProcessor to use for image preprocessing. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. - size_conversion (`Dict`, *optional*): - A dictionary indicating size conversions for images. - """ - def __init__( self, image_processor=None, @@ -74,6 +62,10 @@ def __init__( chat_template: Optional[str] = None, size_conversion: Optional[dict[Union[float, int], int]] = None, ): + """ + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
+ """ if size_conversion is None: size_conversion = {490: 128, 980: 256} self.size_conversion = {int(k): v for k, v in size_conversion.items()} @@ -85,6 +77,7 @@ def __init__( super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], @@ -92,18 +85,6 @@ def __call__( **kwargs: Unpack[AriaProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). - - Args: - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 049b0e5d24eb..08d42fff08a5 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput, make_flat_list_of_images from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): @@ -36,16 +37,26 @@ class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class AyaVisionProcessor(ProcessorMixin): - r""" - Constructs a AyaVision processor which wraps a [`AutoImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~AyaVisionProcessor.__call__`] and [`~AyaVisionProcessor.decode`] for more information. - Args: - image_processor ([`AutoImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 28, + img_size: int = 364, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + downsample_factor: int = 1, + start_of_img_token="<|START_OF_IMG|>", + end_of_img_token="<|END_OF_IMG|>", + img_patch_token="<|IMG_PATCH|>", + img_line_break_token="<|IMG_LINE_BREAK|>", + tile_token="TILE", + tile_global_token="TILE_GLOBAL", + chat_template=None, + **kwargs, + ): + """ patch_size (`int`, *optional*, defaults to 28): The size of image patches for tokenization. img_size (`int`, *optional*, defaults to 364): @@ -66,27 +77,7 @@ class AyaVisionProcessor(ProcessorMixin): The token to be used to represent an image patch in the text. tile_global_token (`str`, *optional*, defaults to `"TILE_GLOBAL"`): The token to be used to represent the cover image in the text. 
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - - def __init__( - self, - image_processor=None, - tokenizer=None, - patch_size: int = 28, - img_size: int = 364, - image_token="", # set the default and let users change if they have peculiar special tokens in rare cases - downsample_factor: int = 1, - start_of_img_token="<|START_OF_IMG|>", - end_of_img_token="<|END_OF_IMG|>", - img_patch_token="<|IMG_PATCH|>", - img_line_break_token="<|IMG_LINE_BREAK|>", - tile_token="TILE", - tile_global_token="TILE_GLOBAL", - chat_template=None, - **kwargs, - ): + """ super().__init__(image_processor, tokenizer, chat_template=chat_template) self.image_token = image_token @@ -125,6 +116,7 @@ def _prompt_split_image(self, num_patches): img_string += f"{self.end_of_img_token}" return img_string + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -132,24 +124,6 @@ def __call__( **kwargs: Unpack[AyaVisionProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. - To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 403d107f48f9..9c702013c740 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -26,6 +26,7 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...utils.hub import cached_file from ..auto import AutoTokenizer @@ -33,13 +34,16 @@ logger = logging.get_logger(__name__) +@auto_docstring class BarkProcessor(ProcessorMixin): - r""" - Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor. + preset_shape = { + "semantic_prompt": 1, # 1D array of shape (X,) + "coarse_prompt": 2, # 2D array of shape (2,X) + "fine_prompt": 2, # 2D array of shape (8,X) + } - Args: - tokenizer ([`PreTrainedTokenizer`]): - An instance of [`PreTrainedTokenizer`]. 
+ def __init__(self, tokenizer, speaker_embeddings=None): + """ speaker_embeddings (`dict[dict[str]]`, *optional*): Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"` @@ -47,15 +51,7 @@ class BarkProcessor(ProcessorMixin): [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for a list of `voice_preset_names`. - """ - - preset_shape = { - "semantic_prompt": 1, # 1D array of shape (X,) - "coarse_prompt": 2, # 2D array of shape (2,X) - "fine_prompt": 2, # 2D array of shape (8,X) - } - - def __init__(self, tokenizer, speaker_embeddings=None): + """ super().__init__(tokenizer) self.speaker_embeddings = speaker_embeddings @@ -260,6 +256,7 @@ def _verify_speaker_embeddings(self, remove_unavailable: bool = True): for voice_preset in unavailable_keys: del self.speaker_embeddings[voice_preset] + @auto_docstring def __call__( self, text=None, @@ -272,26 +269,11 @@ def __call__( **kwargs, ) -> BatchEncoding: """ - Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs` - arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a - voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded - to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename. - - Args: - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - voice_preset (`str`, `dict[np.ndarray]`): - The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g - `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or - it can be a valid file name of a local `.npz` single voice preset containing the keys - `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + voice_preset (`str`, `dict[np.ndarray]`): + The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g + `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or + it can be a valid file name of a local `.npz` single voice preset containing the keys + `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"`. Returns: [`BatchEncoding`]: A [`BatchEncoding`] object containing the output of the `tokenizer`. 
diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 965164206c5a..50c7ae96a130 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -17,6 +17,7 @@ """ from typing import Optional, Union +from ...utils.auto_docstring import auto_docstring from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack @@ -39,48 +40,20 @@ class BlipProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BlipProcessor(ProcessorMixin): - r""" - Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor. - - [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the - docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. - - Args: - image_processor (`BlipImageProcessor`): - An instance of [`BlipImageProcessor`]. The image processor is a required input. - tokenizer (`BertTokenizerFast`): - An instance of ['BertTokenizerFast`]. The tokenizer is a required input. - """ def __init__(self, image_processor, tokenizer, **kwargs): tokenizer.return_token_type_ids = False super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[BlipProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Please refer to the docstring of the above two methods for more information. - Args: - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - """ if images is None and text is None: raise ValueError("You have to specify either images or text.") diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 5949e2c648ce..36ba8bcf4557 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -23,6 +23,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring logger = logging.get_logger(__name__) @@ -44,23 +45,13 @@ class Blip2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Blip2Processor(ProcessorMixin): - r""" - Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. 
- - [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring - of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. - - Args: - image_processor (`BlipImageProcessor`): - An instance of [`BlipImageProcessor`]. The image processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): + """ num_query_tokens (`int`, *optional*): Number of tokens used by the Qformer as queries, should be same as in model's config. - """ - - def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): + """ tokenizer.return_token_type_ids = False if not hasattr(tokenizer, "image_token"): self.image_token = AddedToken("", normalized=False, special=True) @@ -71,30 +62,13 @@ def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[Blip2ProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Please refer to the docstring of the above two methods for more information. - Args: - images (`ImageInput`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - """ if images is None and text is None: raise ValueError("You have to specify either images or text.") output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 5de97ec411dc..5ccd7aab5658 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): @@ -38,22 +39,8 @@ class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BridgeTowerProcessor(ProcessorMixin): - r""" - Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single - processor. - - [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and - [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and - [`~BridgeTowerProcessor.decode`] for more information. 
- - Args: - image_processor (`BridgeTowerImageProcessor`): - An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. - tokenizer (`RobertaTokenizerFast`): - An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. - """ - valid_processor_kwargs = BridgeTowerProcessorKwargs def __init__(self, image_processor, tokenizer): diff --git a/src/transformers/models/bros/processing_bros.py b/src/transformers/models/bros/processing_bros.py index d92b163955a7..a2317c43205e 100644 --- a/src/transformers/models/bros/processing_bros.py +++ b/src/transformers/models/bros/processing_bros.py @@ -17,6 +17,7 @@ """ from ...processing_utils import ProcessingKwargs, ProcessorMixin +from ...utils.auto_docstring import auto_docstring class BrosProcessorKwargs(ProcessingKwargs, total=False): @@ -34,17 +35,8 @@ class BrosProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class BrosProcessor(ProcessorMixin): - r""" - Constructs a Bros processor which wraps a BERT tokenizer. - - [`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of - [`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information. - - Args: - tokenizer (`BertTokenizerFast`, *optional*): - An instance of ['BertTokenizerFast`]. The tokenizer is a required input. - """ valid_processor_kwargs = BrosProcessorKwargs diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 694be7ab8f26..418ef1d30147 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -30,6 +30,7 @@ Unpack, ) from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class ChameleonTextKwargs(TextKwargs, total=False): @@ -50,26 +51,15 @@ class ChameleonProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class ChameleonProcessor(ProcessorMixin): - r""" - Constructs a Chameleon processor which wraps a Chameleon image processor and a Chameleon tokenizer into a single - processor. - - [`ChameleonProcessor`] offers all the functionalities of [`ChameleonImageProcessor`] and [`LlamaTokenizerFast`]. - See the [`~ChameleonProcessor.__call__`] and [`~ChameleonProcessor.decode`] for more information. - - Args: - image_processor ([`ChameleonImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. + def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + """ image_seq_length (`int`, *optional*, defaults to 1024): Sequence length of one image embedding. image_token (`str`, *optional*, defaults to `""`): The special token used to indicate image in the text. 
- """ - - def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): + """ self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) @@ -84,6 +74,7 @@ def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, ima super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -91,26 +82,6 @@ def __call__( **kwargs: Unpack[ChameleonProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 6508136f772e..e60944c330e7 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class ChineseCLIPProcessor(ProcessorMixin): - r""" - Constructs a Chinese-CLIP processor which wraps a Chinese-CLIP image processor and a Chinese-CLIP tokenizer into a - single processor. - - [`ChineseCLIPProcessor`] offers all the functionalities of [`ChineseCLIPImageProcessor`] and [`BertTokenizerFast`]. - See the [`~ChineseCLIPProcessor.__call__`] and [`~ChineseCLIPProcessor.decode`] for more information. - - Args: - image_processor ([`ChineseCLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`BertTokenizerFast`], *optional*): - The tokenizer is a required input. 
- """ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clap/processing_clap.py b/src/transformers/models/clap/processing_clap.py index a72151cb9b63..4a733f33cc4a 100644 --- a/src/transformers/models/clap/processing_clap.py +++ b/src/transformers/models/clap/processing_clap.py @@ -22,30 +22,20 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg logger = logging.get_logger(__name__) +@auto_docstring class ClapProcessor(ProcessorMixin): - r""" - Constructs a CLAP processor which wraps a CLAP feature extractor and a RoBerta tokenizer into a single processor. - - [`ClapProcessor`] offers all the functionalities of [`ClapFeatureExtractor`] and [`RobertaTokenizerFast`]. See the - [`~ClapProcessor.__call__`] and [`~ClapProcessor.decode`] for more information. - - Args: - feature_extractor ([`ClapFeatureExtractor`]): - The audio processor is a required input. - tokenizer ([`RobertaTokenizerFast`]): - The tokenizer is a required input. - """ - def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) @deprecate_kwarg("audios", version="v4.59.0", new_name="audio") + @auto_docstring def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None, @@ -53,11 +43,6 @@ def __call__( audio: Optional[AudioInput] = None, **kwargs: Unpack[ProcessingKwargs], ): - """ - Forwards the `audio` and `sampling_rate` arguments to [`~ClapFeatureExtractor.__call__`] and the `text` - argument to [`~RobertaTokenizerFast.__call__`]. Please refer to the docstring of the above two methods for more - information. - """ # The `deprecate_kwarg` will not work if the inputs are passed as arguments, so we check # again that the correct naming is used if audios is not None and audio is None: diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 9258d2e8fee3..3b20fff7490a 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils import auto_docstring +@auto_docstring class CLIPProcessor(ProcessorMixin): - r""" - Constructs a CLIP processor which wraps a CLIP image processor and a CLIP tokenizer into a single processor. - - [`CLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`CLIPTokenizerFast`]. See the - [`~CLIPProcessor.__call__`] and [`~CLIPProcessor.decode`] for more information. - - Args: - image_processor ([`CLIPImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`AutoTokenizer`], *optional*): - The tokenizer is a required input. 
- """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index 4d431181cb4f..f856b97a4490 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -18,51 +18,21 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding +from ...utils.auto_docstring import auto_docstring +@auto_docstring class CLIPSegProcessor(ProcessorMixin): - r""" - Constructs a CLIPSeg processor which wraps a CLIPSeg image processor and a CLIP tokenizer into a single processor. - - [`CLIPSegProcessor`] offers all the functionalities of [`ViTImageProcessor`] and [`CLIPTokenizerFast`]. See the - [`~CLIPSegProcessor.__call__`] and [`~CLIPSegProcessor.decode`] for more information. - - Args: - image_processor ([`ViTImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`CLIPTokenizerFast`], *optional*): - The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__(self, text=None, images=None, visual_prompt=None, return_tensors=None, **kwargs): """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to CLIPTokenizerFast's [`~CLIPTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - ViTImageProcessor's [`~ViTImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring of - the above two methods for more information. - - Args: - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, - NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape - (C, H, W), where C is a number of channels, H and W are image height and width. - - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): + The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, + NumPy array or PyTorch tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape + (C, H, W), where C is a number of channels, H and W are image height and width. Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: diff --git a/src/transformers/models/clvp/processing_clvp.py b/src/transformers/models/clvp/processing_clvp.py index 331589a23999..41812fe21195 100644 --- a/src/transformers/models/clvp/processing_clvp.py +++ b/src/transformers/models/clvp/processing_clvp.py @@ -19,34 +19,20 @@ from ...processing_utils import ProcessorMixin from ...utils import logging +from ...utils.auto_docstring import auto_docstring logger = logging.get_logger(__name__) +@auto_docstring class ClvpProcessor(ProcessorMixin): - r""" - Constructs a CLVP processor which wraps a CLVP Feature Extractor and a CLVP Tokenizer into a single processor. - - [`ClvpProcessor`] offers all the functionalities of [`ClvpFeatureExtractor`] and [`ClvpTokenizer`]. See the - [`~ClvpProcessor.__call__`], [`~ClvpProcessor.decode`] and [`~ClvpProcessor.batch_decode`] for more information. - - Args: - feature_extractor (`ClvpFeatureExtractor`): - An instance of [`ClvpFeatureExtractor`]. The feature extractor is a required input. - tokenizer (`ClvpTokenizer`): - An instance of [`ClvpTokenizer`]. The tokenizer is a required input. - """ def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) + @auto_docstring def __call__(self, *args, **kwargs): - """ - Forwards the `audio` and `sampling_rate` arguments to [`~ClvpFeatureExtractor.__call__`] and the `text` - argument to [`~ClvpTokenizer.__call__`]. Please refer to the docstring of the above two methods for more - information. - """ raw_speech = kwargs.pop("raw_speech", None) if raw_speech is not None: logger.warning( diff --git a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py index b34fd1c5594e..690a1906e7cf 100644 --- a/src/transformers/models/cohere2_vision/processing_cohere2_vision.py +++ b/src/transformers/models/cohere2_vision/processing_cohere2_vision.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): @@ -33,20 +34,8 @@ class Cohere2VisionProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Cohere2VisionProcessor(ProcessorMixin): - r""" - Constructs a Cohere2Vision processor which wraps a [`AutoImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~Cohere2VisionProcessor.__call__`] and [`~Cohere2VisionProcessor.decode`] for more information. - Args: - image_processor ([`AutoImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. 
- """ - def __init__( self, image_processor=None, @@ -72,6 +61,7 @@ def __init__( ] ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -79,24 +69,6 @@ def __call__( **kwargs: Unpack[Cohere2VisionProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. - To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/colpali/processing_colpali.py b/src/transformers/models/colpali/processing_colpali.py index 1ad511ced7a7..463be38d818d 100644 --- a/src/transformers/models/colpali/processing_colpali.py +++ b/src/transformers/models/colpali/processing_colpali.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -72,27 +73,8 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" +@auto_docstring class ColPaliProcessor(ProcessorMixin): - r""" - Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as - well as to compute the late-interaction retrieval score. - - [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`] - for more information. - - Args: - image_processor ([`SiglipImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): - A string that gets tokenized and prepended to the image tokens. - query_prefix (`str`, *optional*, defaults to `"Question: "`): - A prefix to be used for the query. 
- """ - def __init__( self, image_processor=None, @@ -101,6 +83,12 @@ def __init__( visual_prompt_prefix: str = "Describe the image.", query_prefix: str = "Question: ", ): + """ + visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Question: "`): + A prefix to be used for the query. + """ self.visual_prompt_prefix = visual_prompt_prefix self.query_prefix = query_prefix if not hasattr(image_processor, "image_seq_length"): @@ -124,6 +112,7 @@ def __init__( super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -131,31 +120,6 @@ def __call__( **kwargs: Unpack[ColPaliProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom - wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process - both text and images at the same time. - - When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's - [`~LlamaTokenizerFast.__call__`]. - When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's - [`~SiglipImageProcessor.__call__`]. - Please refer to the docstring of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/colqwen2/processing_colqwen2.py b/src/transformers/models/colqwen2/processing_colqwen2.py index 00f00c920856..a8d1db4fff4e 100644 --- a/src/transformers/models/colqwen2/processing_colqwen2.py +++ b/src/transformers/models/colqwen2/processing_colqwen2.py @@ -26,6 +26,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -45,25 +46,8 @@ class ColQwen2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class ColQwen2Processor(ProcessorMixin): - r""" - Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as - well as to compute the late-interaction retrieval score. 
- - [`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`] - for more information. - - Args: - image_processor ([`Qwen2VLImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`Qwen2TokenizerFast`], *optional*): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens. - query_prefix (`str`, *optional*): A prefix to be used for the query. - """ - def __init__( self, image_processor=None, @@ -85,6 +69,7 @@ def __init__( query_prefix = "Query: " self.query_prefix = query_prefix + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -92,31 +77,10 @@ def __call__( **kwargs: Unpack[ColQwen2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom - wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process - both text and images at the same time. - - When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's - [`~Qwen2TokenizerFast.__call__`]. - When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's - [`~Qwen2VLImageProcessor.__call__`]. - Please refer to the doctsring of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - + visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`): + A string that gets tokenized and prepended to the image tokens. + query_prefix (`str`, *optional*, defaults to `"Query: "`): + A prefix to be used for the query. 
Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/csm/processing_csm.py b/src/transformers/models/csm/processing_csm.py index d77ffeffd896..ff59ee3f912b 100644 --- a/src/transformers/models/csm/processing_csm.py +++ b/src/transformers/models/csm/processing_csm.py @@ -20,6 +20,7 @@ import numpy as np from ...utils import is_soundfile_available, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -59,42 +60,8 @@ class CsmProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class CsmProcessor(ProcessorMixin): - r""" - Constructs a Csm processor which wraps [`EncodecFeatureExtractor`] and - [`PretrainedTokenizerFast`] into a single processor that inherits both the audio feature extraction and - tokenizer functionalities. See the [`~CsmProcessor.__call__`] for more - information. - The preferred way of passing kwargs is as a dictionary per modality, see usage example below. - ```python - from transformers import CsmProcessor - from datasets import load_dataset - - ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train") - audio = ds[0]["audio"]["array"] - - processor = CsmProcessor.from_pretrained("sesame/csm-1b") - - processor( - text=["<|begin_of_text|>[0]What are you working on?<|end_of_text|><|AUDIO|><|audio_eos|><|begin_of_text|>[1]I'm figuring out my budget.<|end_of_text|>"], - audio=audio, - text_kwargs = {"padding": False}, - audio_kwargs = {"sampling_rate": 16000}, - common_kwargs = {"return_tensors": "pt"}, - ) - # this should error out because EncodecFeatureExtractor expects a 24kHz audio :) - ``` - - Args: - feature_extractor ([`EncodecFeatureExtractor`]): - The feature extractor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - - """ - def __init__( self, feature_extractor, @@ -189,6 +156,7 @@ def save_audio( audio_value = audio_value.cpu().float().numpy() sf.write(p, audio_value, sampling_rate) + @auto_docstring def __call__( self, text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]], @@ -197,21 +165,7 @@ def __call__( depth_decoder_labels_ratio: Optional[float] = 1.0, **kwargs: Unpack[CsmProcessorKwargs], ): - r""" - Main method to prepare text(s) and audio to be fed as input to the model. This method forwards the `text` - arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode - the text. To prepare the audio, this method forwards the `audio` arguments to - EncodecFeatureExtractor's [`~EncodecFeatureExtractor.__call__`]. Please refer - to the docstring of the above two methods for more information. - - Args: - audio (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`): - The audio or batch of audio to be prepared. Each audio can be a NumPy array or PyTorch - tensor. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + """ output_labels (bool, *optional*, default=False): Whether to return labels for training. Indices will be in `[config.audio_token_id, -100, -101]`. 
- `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames) @@ -219,10 +173,7 @@ def __call__( - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels) depth_decoder_labels_ratio (float, *optional*, default=1.0): The ratio of audio frames to keep for the depth decoder labels. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. + Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py index 22b1c2ab71dd..2e1ff47ad437 100644 --- a/src/transformers/models/deepseek_vl/processing_deepseek_vl.py +++ b/src/transformers/models/deepseek_vl/processing_deepseek_vl.py @@ -24,6 +24,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): @@ -33,25 +34,8 @@ class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DeepseekVLProcessor(ProcessorMixin): - r""" - Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor. - - [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information. - - Args: - image_processor ([`DeepseekVLImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - num_image_tokens (`int`, *optional*, defaults to 576): - The number of special image tokens used as placeholders for visual content in text sequences. - """ - def __init__( self, image_processor, @@ -59,11 +43,16 @@ def __init__( chat_template=None, num_image_tokens=576, ): + """ + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ self.image_token = tokenizer.image_token self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, @@ -71,25 +60,6 @@ def __call__( **kwargs: Unpack[DeepseekVLProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring - of the above two methods for more information. 
- - Args: - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py index 8f842db7346f..db19160f0a71 100644 --- a/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py +++ b/src/transformers/models/deepseek_vl_hybrid/processing_deepseek_vl_hybrid.py @@ -24,6 +24,7 @@ from ...image_utils import ImageInput from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): @@ -33,25 +34,8 @@ class DeepseekVLHybridProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DeepseekVLHybridProcessor(ProcessorMixin): - r""" - Constructs a DeepseekVLHybrid processor which wraps a DeepseekVLHybrid Image Processor and a Llama tokenizer into a single processor. - - [`DeepseekVLHybridProcessor`] offers all the functionalities of [`DeepseekVLHybridImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~DeepseekVLHybridProcessor.__call__`] and [`~DeepseekVLHybridProcessor.decode`] for more information. - - Args: - image_processor ([`DeepseekVLHybridImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): - A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - num_image_tokens (`int`, *optional*, defaults to 576): - The number of special image tokens used as placeholders for visual content in text sequences. - """ - def __init__( self, image_processor, @@ -59,11 +43,16 @@ def __init__( chat_template=None, num_image_tokens=576, ): + """ + num_image_tokens (`int`, *optional*, defaults to 576): + The number of special image tokens used as placeholders for visual content in text sequences. + """ self.image_token = tokenizer.image_token self.num_image_tokens = num_image_tokens super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, @@ -71,32 +60,13 @@ def __call__( **kwargs: Unpack[DeepseekVLHybridProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). 
This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - DeepseekVLHybridImageProcessor's [`~DeepseekVLHybridImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring - of the above two methods for more information. - - Args: - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when - `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not - `None`). + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ output_kwargs = self._merge_kwargs( diff --git a/src/transformers/models/dia/processing_dia.py b/src/transformers/models/dia/processing_dia.py index 23c04687308c..0d1281f0744b 100644 --- a/src/transformers/models/dia/processing_dia.py +++ b/src/transformers/models/dia/processing_dia.py @@ -22,6 +22,7 @@ from ...feature_extraction_utils import BatchFeature from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack from ...utils import is_soundfile_available, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -61,27 +62,18 @@ class DiaProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class DiaProcessor(ProcessorMixin): - r""" - Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into - a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio- - nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more - information. - - Args: - feature_extractor (`DiaFeatureExtractor`): - An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input. - tokenizer (`DiaTokenizer`): - An instance of [`DiaTokenizer`]. The tokenizer is a required input. - audio_tokenizer (`DacModel`): - An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. 
- """ - audio_tokenizer_class = "DacModel" def __init__(self, feature_extractor, tokenizer, audio_tokenizer): + """ + audio_tokenizer (`DacModel`): + An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input. + """ super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer) + @auto_docstring def __call__( self, text: Union[str, list[str]], @@ -89,12 +81,6 @@ def __call__( output_labels: Optional[bool] = False, **kwargs: Unpack[DiaProcessorKwargs], ): - """ - Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is - forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the - DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer - to the docstring of the above methods for more information. - """ if not is_torch_available(): raise ValueError( "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't " diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index fedd173117eb..c004b9499df7 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -23,6 +23,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring class DonutProcessorKwargs(ProcessingKwargs, total=False): @@ -32,37 +33,18 @@ class DonutProcessorKwargs(ProcessingKwargs, total=False): logger = logging.get_logger(__name__) +@auto_docstring class DonutProcessor(ProcessorMixin): - r""" - Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single - processor. - - [`DonutProcessor`] offers all the functionalities of [`DonutImageProcessor`] and - [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. See the [`~DonutProcessor.__call__`] and - [`~DonutProcessor.decode`] for more information. - - Args: - image_processor ([`DonutImageProcessor`], *optional*): - An instance of [`DonutImageProcessor`]. The image processor is a required input. - tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*): - An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None, **kwargs: Unpack[DonutProcessorKwargs], ): - """ - When used in normal mode, this method forwards all its arguments to AutoImageProcessor's - [`~AutoImageProcessor.__call__`] and returns its output. If used in the context - [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's - [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. 
- """ if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") diff --git a/src/transformers/models/emu3/image_processing_emu3.py b/src/transformers/models/emu3/image_processing_emu3.py index 0c550937581f..735046dd9390 100644 --- a/src/transformers/models/emu3/image_processing_emu3.py +++ b/src/transformers/models/emu3/image_processing_emu3.py @@ -48,6 +48,13 @@ class Emu3ImageProcessorKwargs(ImagesKwargs, total=False): + """ + ratio (`str`, *optional*, defaults to `"1:1"`): + The ratio of the image to resize the image. + image_area (`int`, *optional*, defaults to `518400`): + The area of the image to resize the image. + """ + ratio: str image_area: int diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index 52f39a913c54..c7355c67effa 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -22,11 +22,11 @@ from ...image_utils import ImageInput from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput -from ...utils import is_vision_available +from ...utils import auto_docstring, is_vision_available if is_vision_available(): - from .image_processing_emu3 import smart_resize + from .image_processing_emu3 import Emu3ImageProcessorKwargs, smart_resize class Emu3TextKwargs(TextKwargs, total=False): @@ -35,6 +35,7 @@ class Emu3TextKwargs(TextKwargs, total=False): class Emu3ProcessorKwargs(ProcessingKwargs, total=False): text_kwargs: Emu3TextKwargs + images_kwargs: Emu3ImageProcessorKwargs _defaults = { "text_kwargs": { "return_for_image_generation": False, @@ -47,23 +48,8 @@ class Emu3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Emu3Processor(ProcessorMixin): - r""" - Constructs a Emu3 processor which wraps a Emu3 image processor and a GPT2 tokenizer into a single - processor. - - [`Emu3Processor`] offers all the functionalities of [`Emu3ImageProcessor`] and [`GPT2TokenizerFast`]. - See the [`~Emu3Processor.__call__`] and [`~Emu3Processor.decode`] for more information. - - Args: - image_processor ([`Emu3ImageProcessor`]): - The image processor is a required input. - tokenizer ([`Emu3TokenizerFast`]): - The tokenizer is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - def __init__( self, image_processor, @@ -81,6 +67,7 @@ def __init__( self.downsample_ratio = 8 super().__init__(image_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -88,26 +75,6 @@ def __call__( **kwargs: Unpack[Emu3ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to Emu3TokenizerFast's [`~Emu3TokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. 
Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/evolla/processing_evolla.py b/src/transformers/models/evolla/processing_evolla.py index 807bd294c406..afc0812f5f80 100644 --- a/src/transformers/models/evolla/processing_evolla.py +++ b/src/transformers/models/evolla/processing_evolla.py @@ -22,30 +22,23 @@ from ...processing_utils import ( ProcessorMixin, ) +from ...utils.auto_docstring import auto_docstring PROTEIN_VALID_KEYS = ["aa_seq", "foldseek", "msa"] +@auto_docstring class EvollaProcessor(ProcessorMixin): - r""" - Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor. - - [`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the - docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information. - - Args: + def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs): + """ protein_tokenizer (`EsmTokenizer`): An instance of [`EsmTokenizer`]. The protein tokenizer is a required input. - tokenizer (`LlamaTokenizerFast`, *optional*): - An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. protein_max_length (`int`, *optional*, defaults to 1024): The maximum length of the sequence to be generated. text_max_length (`int`, *optional*, defaults to 512): The maximum length of the text to be generated. - """ - - def __init__(self, protein_tokenizer, tokenizer=None, protein_max_length=1024, text_max_length=512, **kwargs): + """ if protein_tokenizer is None: raise ValueError("You need to specify an `protein_tokenizer`.") if tokenizer is None: @@ -94,6 +87,7 @@ def process_text( ) return prompt_inputs + @auto_docstring def __call__( self, proteins: Optional[Union[list[dict], dict]] = None, @@ -102,7 +96,8 @@ def __call__( text_max_length: Optional[int] = None, **kwargs, ): - r"""This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by + r""" + This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by the model. Args: diff --git a/src/transformers/models/flava/processing_flava.py b/src/transformers/models/flava/processing_flava.py index 7e5b3c0e012e..0bb603753788 100644 --- a/src/transformers/models/flava/processing_flava.py +++ b/src/transformers/models/flava/processing_flava.py @@ -17,20 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class FlavaProcessor(ProcessorMixin): - r""" - Constructs a FLAVA processor which wraps a FLAVA image processor and a FLAVA tokenizer into a single processor. 
- - [`FlavaProcessor`] offers all the functionalities of [`FlavaImageProcessor`] and [`BertTokenizerFast`]. See the - [`~FlavaProcessor.__call__`] and [`~FlavaProcessor.decode`] for more information. - - Args: - image_processor ([`FlavaImageProcessor`], *optional*): The image processor is a required input. - tokenizer ([`BertTokenizerFast`], *optional*): The tokenizer is a required input. - """ - def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/florence2/processing_florence2.py b/src/transformers/models/florence2/processing_florence2.py index c8d699e4bc3e..d6fca01ef2c1 100644 --- a/src/transformers/models/florence2/processing_florence2.py +++ b/src/transformers/models/florence2/processing_florence2.py @@ -28,6 +28,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -42,26 +43,8 @@ class Florence2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Florence2Processor(ProcessorMixin): - r""" - Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor. - - [`Florence2Processor`] offers all the functionalities of [`AutoImageProcessor`] and [`BartTokenizerFast`]. See the - [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information. - - Args: - image_processor (`AutoImageProcessor`, *optional*): - The image processor is a required input. - tokenizer (`Union[BartTokenizer, BartTokenizerFast]`, *optional*): - The tokenizer is a required input. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. - post_processor_config (`dict`, *optional*, defaults to 0): - Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, - thresholds, or banned tokens. - """ - def __init__( self, image_processor=None, @@ -70,6 +53,14 @@ def __init__( post_processor_config: Optional[dict] = None, **kwargs, ): + """ + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. + post_processor_config (`dict`, *optional*, defaults to `None`): + Task-specific parsing rules for [`Florence2PostProcessor`], e.g. regex patterns, + thresholds, or banned tokens. + """ self.tasks_answer_post_processing_type = { "": "pure_text", "": "ocr", @@ -143,6 +134,7 @@ def _construct_prompts(self, text: Union[str, list[str]]) -> list[str]: prompts.append(prompt) return prompts + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -150,25 +142,6 @@ def __call__( **kwargs: Unpack[Florence2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode - the text.
To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index ee697deccf9e..ceda84912ba5 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -30,6 +30,7 @@ ) from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging, requires_backends +from ...utils.auto_docstring import auto_docstring from ...utils.import_utils import requires @@ -333,20 +334,8 @@ def scale_bbox_to_transformed_image( @requires(backends=("vision",)) +@auto_docstring class FuyuProcessor(ProcessorMixin): - r""" - Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor. - - [`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the - [`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information. - - Args: - image_processor ([`FuyuImageProcessor`]): - The image processor is a required input. - tokenizer ([`LlamaTokenizerFast`]): - The tokenizer is a required input. - """ - def __init__(self, image_processor, tokenizer, **kwargs): super().__init__(image_processor=image_processor, tokenizer=tokenizer) self.image_processor = image_processor @@ -478,6 +467,7 @@ def get_sample_encoding( } return batch_encoding + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -485,21 +475,6 @@ def __call__( **kwargs: Unpack[FuyuProcessorKwargs], ) -> "FuyuBatchFeature": """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to - encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to - FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring - of the above two methods for more information. - - Args: - images (`PIL.Image.Image`, `list[PIL.Image.Image]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. 
- text (`str`, `list[str]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - Returns: [`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields: diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index 11574e30b7c1..d8d8e3d61e26 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -23,6 +23,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import to_py_obj +from ...utils.auto_docstring import auto_docstring class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): @@ -41,6 +42,7 @@ class Gemma3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Gemma3Processor(ProcessorMixin): def __init__( self, @@ -64,6 +66,7 @@ def __init__( **kwargs, ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, diff --git a/src/transformers/models/gemma3n/processing_gemma3n.py b/src/transformers/models/gemma3n/processing_gemma3n.py index 51b686557ed0..9be0c408c129 100644 --- a/src/transformers/models/gemma3n/processing_gemma3n.py +++ b/src/transformers/models/gemma3n/processing_gemma3n.py @@ -21,6 +21,7 @@ from ...image_utils import ImageInput, make_nested_list_of_images from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput +from ...utils.auto_docstring import auto_docstring class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): @@ -29,28 +30,8 @@ class Gemma3nProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Gemma3nProcessor(ProcessorMixin): - """ - A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer - into a single processor. - - Args: - feature_extractor (`Gemma3nAudioFeatureExtractor`): - Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This - should return a `BatchFeature` with `input_features` and `input_features_mask` features. - image_processor (`SiglipImageProcessorFast`): - Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature` - with a `pixel_values` feature. - tokenizer (`GemmaTokenizerFast`): - The text tokenizer for the model. - chat_template (`string`, *optional*): - A Jinja template for generating text prompts from a set of messages. 
- audio_seq_length (int, *optional*, defaults to 188): - The number of audio soft tokens that will be added to the text prompt - image_seq_length (int, *optional*, defaults to 256): - The number of image soft tokens that should be added to - """ - def __init__( self, feature_extractor, @@ -61,6 +42,12 @@ def __init__( image_seq_length: int = 256, **kwargs, ): + """ + audio_seq_length (int, *optional*, defaults to 188): + The number of audio soft tokens that will be added to the text prompt + image_seq_length (int, *optional*, defaults to 256): + The number of image soft tokens that will be added to the text prompt + """ self.audio_seq_length = audio_seq_length self.audio_token_id = tokenizer.audio_token_id self.boa_token = tokenizer.boa_token @@ -83,6 +70,7 @@ def __init__( **kwargs, ) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 89cfc9618987..80e4f31ad3e1 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -17,22 +17,11 @@ """ from ...processing_utils import ProcessorMixin +from ...utils.auto_docstring import auto_docstring +@auto_docstring class GitProcessor(ProcessorMixin): - r""" - Constructs a GIT processor which wraps a CLIP image processor and a BERT tokenizer into a single processor. - - [`GitProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BertTokenizerFast`]. See the - [`~GitProcessor.__call__`] and [`~GitProcessor.decode`] for more information. - - Args: - image_processor ([`AutoImageProcessor`]): - The image processor is a required input. - tokenizer ([`AutoTokenizer`]): - The tokenizer is a required input. - """ - def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) diff --git a/src/transformers/models/glm4v/processing_glm4v.py b/src/transformers/models/glm4v/processing_glm4v.py index 79935cbde7b4..8dd68393a76b 100644 --- a/src/transformers/models/glm4v/processing_glm4v.py +++ b/src/transformers/models/glm4v/processing_glm4v.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring from ...video_utils import VideoInput @@ -44,21 +45,8 @@ class Glm4vProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Glm4vProcessor(ProcessorMixin): - r""" - Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor. - [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information. - Args: - image_processor ([`Glm4vProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. - video_processor ([`Glm4vVideoProcessor`], *optional*): - The video processor is a required input. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string.
- """ - def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs): self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token @@ -74,6 +62,7 @@ def __init__(self, image_processor=None, tokenizer=None, video_processor=None, c ) super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -82,26 +71,6 @@ def __call__( **kwargs: Unpack[Glm4vProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode - the text. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `List[str]`, `List[List[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`): - The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch - tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported. - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. - Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 162efef5e9f9..9b5a773ef4ec 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -23,6 +23,7 @@ from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_vision_available, logging +from ...utils.auto_docstring import auto_docstring if is_vision_available(): @@ -79,20 +80,8 @@ def preprocess_box_annotation(box: Union[list, tuple], image_size: tuple[int, in return list(box) +@auto_docstring class GotOcr2Processor(ProcessorMixin): - r""" - Constructs a GotOcr2 processor which wraps a [`GotOcr2ImageProcessor`] and - [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and - tokenizer functionalities. See the [`~GotOcr2Processor.__call__`] and [`~GotOcr2Processor.decode`] for more information. - Args: - image_processor ([`GotOcr2ImageProcessor`], *optional*): - The image processor is a required input. - tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): - The tokenizer is a required input. 
- chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -127,6 +116,7 @@ def _make_list_of_inputs(self, images, text, box, color, multi_page): return images, text, box, color + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, @@ -134,45 +124,6 @@ def __call__( **kwargs: Unpack[GotOcr2ProcessorKwargs], ) -> BatchFeature: """ - Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` - and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text` - is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and - `crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to - GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. Both channels-first and channels-last formats are supported. - text (`str`, `list[str]`, `list[list[str]]`): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - format (`bool`, *optional*): - If set, will add the format token to the query, and the model will return the OCR result with formatting. - box (`list[float]`, `list[tuple[float, float]]`, `list[tuple[float, float, float, float]]`, *optional*): - The box annotation to be added to the query. If a list of floats or a tuple of floats is provided, it - will be interpreted as [x1, y1, x2, y2]. If a list of tuples is provided, each tuple should be in the - form (x1, y1, x2, y2). - color (`str`, *optional*): - The color annotation to be added to the query. The model will return the OCR result within the box with - the specified color. - multi_page (`bool`, *optional*): - If set, will enable multi-page inference. The model will return the OCR result across multiple pages. - crop_to_patches (`bool`, *optional*): - If set, will crop the image to patches. The model will return the OCR result upon the patch reference. - min_patches (`int`, *optional*): - The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to - `True`. - max_patches (`int`, *optional*): - The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to - `True`. - - return_tensors (`str` or [`~utils.TensorType`], *optional*): - If set, will return tensors of a particular framework. Acceptable values are: - - `'pt'`: Return PyTorch `torch.Tensor` objects. - - `'np'`: Return NumPy `np.ndarray` objects. 
- Returns: [`BatchFeature`]: A [`BatchFeature`] with the following fields: diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py index 910840bd661c..51e7ab8c9031 100644 --- a/src/transformers/models/granite_speech/processing_granite_speech.py +++ b/src/transformers/models/granite_speech/processing_granite_speech.py @@ -20,6 +20,7 @@ from ...processing_utils import ProcessorMixin from ...tokenization_utils import PreTokenizedInput, TextInput from ...utils import is_torch_available, logging +from ...utils.auto_docstring import auto_docstring from ...utils.import_utils import requires_backends @@ -29,6 +30,7 @@ logger = logging.get_logger(__name__) +@auto_docstring class GraniteSpeechProcessor(ProcessorMixin): def __init__( self, @@ -37,9 +39,14 @@ def __init__( audio_token="<|audio|>", chat_template=None, ): + """ + audio_token (str, *optional*, defaults to "<|audio|>"): + The audio token to use for the processor. + """ self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token super().__init__(audio_processor, tokenizer, chat_template=chat_template) + @auto_docstring def __call__( self, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]], diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py index 74565588d852..60258b209d19 100644 --- a/src/transformers/models/grounding_dino/processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py @@ -24,6 +24,7 @@ from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput from ...utils import TensorType, is_torch_available +from ...utils.auto_docstring import auto_docstring if is_torch_available(): @@ -114,47 +115,20 @@ class GroundingDinoProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class GroundingDinoProcessor(ProcessorMixin): - r""" - Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a - single processor. - - [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and - [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`] - for more information. - - Args: - image_processor (`GroundingDinoImageProcessor`): - An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. - """ - valid_processor_kwargs = GroundingDinoProcessorKwargs def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) + @auto_docstring def __call__( self, images: Optional[ImageInput] = None, text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, **kwargs: Unpack[GroundingDinoProcessorKwargs], ) -> BatchEncoding: - """ - This method uses [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and - [`BertTokenizerFast.__call__`] to prepare text for the model. - - Args: - images (`ImageInput`, `list[ImageInput]`, *optional*): - The image or batch of images to be processed. The image might be either PIL image, numpy array or a torch tensor.
- text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`, *optional*): - Candidate labels to be detected on the image. The text might be one of the following: - - A list of candidate labels (strings) to be detected on the image (e.g. ["a cat", "a dog"]). - - A batch of candidate labels to be detected on the batch of images (e.g. [["a cat", "a dog"], ["a car", "a person"]]). - - A merged candidate labels string to be detected on the image, separated by "." (e.g. "a cat. a dog."). - - A batch of merged candidate labels text to be detected on the batch of images (e.g. ["a cat. a dog.", "a car. a person."]). - """ if text is not None: text = self._preprocess_input_text(text) return super().__call__(images=images, text=text, **kwargs) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 7cb640e56854..00d368c1fc2e 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -29,6 +29,7 @@ ) from ...tokenization_utils_base import PreTokenizedInput, TextInput from ...utils import is_torch_available +from ...utils.auto_docstring import auto_docstring from ...utils.deprecation import deprecate_kwarg @@ -135,25 +136,15 @@ def is_url(string): return all([result.scheme, result.netloc]) +@auto_docstring class IdeficsProcessor(ProcessorMixin): - r""" - Constructs a IDEFICS processor which wraps a LLama tokenizer and IDEFICS image processor into a single processor. - - [`IdeficsProcessor`] offers all the functionalities of [`IdeficsImageProcessor`] and [`LlamaTokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`IdeficsImageProcessor`): - An instance of [`IdeficsImageProcessor`]. The image processor is a required input. - tokenizer (`LlamaTokenizerFast`): - An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input. - image_size (`int`, *optional*, defaults to 224): - Image size (assuming a square image) - add_end_of_utterance_token (`str`, *optional*): - The string representation of token representing end of utterance - """ - def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_utterance_token=None, **kwargs): + """ + image_size (int, *optional*, defaults to 224): + The size of the image to be processed. + add_end_of_utterance_token (str, *optional*): + The string representation of the token representing end of utterance. + """ super().__init__(image_processor, tokenizer) self.image_token_id = ( tokenizer.image_token_id @@ -172,6 +163,7 @@ def __init__(self, image_processor, tokenizer=None, image_size=224, add_end_of_u ) @deprecate_kwarg(old_name="prompts", version="5.0.0", new_name="text", raise_if_both_names=True) + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]] = None, @@ -185,29 +177,16 @@ def __call__( ] = None, **kwargs: Unpack[IdeficsProcessorKwargs], ) -> BatchFeature: - """This method takes batched or non-batched prompts made of text and images and converts them into prompts that - the model was trained on and prepares the image pixel values for the model to process. - - Args: - images (`Union[ImageInput, list[ImageInput], str, list[str], list[list[str]]]`): - either a single image or a batched list of images - can be passed in when text contains only text prompts, - in order to use the image-text-to-text behavior.
- text (`Union[list[TextInput], [list[list[TextInput]]]]`): - either a single prompt or a batched list of prompts - see the detailed description immediately after - the end of the arguments doc section. - return_tensors (`str` or `TensorType`, *optional*, defaults to `TensorType.PYTORCH`): - The type of tensors to return. Can be one of: - - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. - + """ Returns: a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be directly passed to `model.generate` - Detailed explanation: + Detailed explanation: - Each entry in `text` is either a text to be passed as is or an image that will be processed. + Each entry in `text` is either a text to be passed as is or an image that will be processed. - An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. + An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved. When the processor encounters an image it'll inject `` entry into the prompt. diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index df5f9ca73a8b..05f7153fdfea 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -28,6 +28,7 @@ ) from ...tokenization_utils_base import AddedToken, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring if TYPE_CHECKING: @@ -55,29 +56,17 @@ class Idefics2ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Idefics2Processor(ProcessorMixin): - r""" - Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor. - - [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`Idefics2ImageProcessor`): - An instance of [`Idefics2ImageProcessor`]. The image processor is a required input. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs + ): + """ image_seq_len (`int`, *optional*, defaults to 64): The length of the image sequence i.e. the number of tokens per image in the input. This parameter is used to build the string from the input prompt and image tokens and should match the config.perceiver_config.resampler_n_latents value for the model used. - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. 
- """ - - def __init__( - self, image_processor, tokenizer=None, image_seq_len: int = 64, chat_template: Optional[str] = None, **kwargs - ): + """ if not hasattr(tokenizer, "image_token"): self.fake_image_token = AddedToken("", normalized=False, special=True).content self.image_token = AddedToken("", normalized=False, special=True).content @@ -107,58 +96,13 @@ def _extract_images_from_prompts(self, prompts): prompt_images.append(images) return prompt_images + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, text: Union[TextInput, "PreTokenizedInput", list[TextInput], list["PreTokenizedInput"]] = None, **kwargs: Unpack[Idefics2ProcessorKwargs], ) -> BatchFeature: - """ - Processes the input prompts and returns a BatchEncoding. - - Example: - - ```python - >>> import requests - >>> from transformers import Idefics2Processor - >>> from transformers.image_utils import load_image - - >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) - >>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example - - >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" - >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg" - - >>> image1, image2 = load_image(url1), load_image(url2) - >>> images = [[image1], [image2]] - - >>> text = [ - ... "In this image, we see", - ... "bla bla bla", - ... ] - >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) - >>> input_ids = outputs.input_ids - >>> input_tokens = processor.tokenizer.batch_decode(input_ids) - >>> print(input_tokens) - [' In this image, we see', ' bla bla bla'] - ``` - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - - Wherever an image token, `` is encountered it is expanded to - `` + `` * `image_seq_len` * `. - return_tensors (`Union[str, TensorType]`, *optional*): - If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more - information. 
- - """ if text is None and images is None: raise ValueError("You must provide either `text` or `images`.") diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 5c978eb3b230..73584110b55e 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -27,6 +27,7 @@ from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack from ...tokenization_utils_base import AddedToken, BatchEncoding, TextInput from ...utils import logging +from ...utils.auto_docstring import auto_docstring if TYPE_CHECKING: @@ -101,29 +102,17 @@ class Idefics3ProcessorKwargs(ProcessingKwargs, total=False): } +@auto_docstring class Idefics3Processor(ProcessorMixin): - r""" - Constructs a Idefics3 processor which wraps a LLama tokenizer and Idefics3 image processor into a single processor. - - [`Idefics3Processor`] offers all the functionalities of [`Idefics3ImageProcessor`] and [`Idefics3TokenizerFast`]. See - the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information. - - Args: - image_processor (`Idefics3ImageProcessor`): - An instance of [`Idefics3ImageProcessor`]. The image processor is a required input. - tokenizer (`PreTrainedTokenizerBase`, *optional*): - An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + def __init__( + self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs + ): + """ image_seq_len (`int`, *optional*, defaults to 169): The length of the image sequence i.e. the number of tokens per image in the input. This parameter is used to build the string from the input prompt and image tokens and should match the value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2)) - chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages - in a chat into a tokenizable string. - """ - - def __init__( - self, image_processor, tokenizer=None, image_seq_len: int = 169, chat_template: Optional[str] = None, **kwargs - ): + """ self.fake_image_token = AddedToken("", normalized=False, special=True).content self.image_token = AddedToken("", normalized=False, special=True).content self.end_of_utterance_token = AddedToken("", normalized=False, special=True).content @@ -164,6 +153,7 @@ def _extract_images_from_prompts(self, prompts): prompt_images.append(images) return prompt_images + @auto_docstring def __call__( self, images: Union[ImageInput, list[ImageInput], list[list[ImageInput]]] = None, @@ -172,51 +162,9 @@ def __call__( **kwargs: Unpack[Idefics3ProcessorKwargs], ) -> BatchEncoding: """ - Processes the input prompts and returns a BatchEncoding. 
- - Example: - - ```python - >>> import requests - >>> from transformers import Idefics3Processor - >>> from transformers.image_utils import load_image - - >>> processor = Idefics3Processor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3") - >>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example - - >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" - >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg" - - >>> image1, image2 = load_image(url1), load_image(url2) - >>> images = [[image1], [image2]] - - >>> text = [ - ... "In this image, we see", - ... "bla bla bla", - ... ] - >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True) - >>> input_ids = outputs.input_ids - >>> input_tokens = processor.tokenizer.batch_decode(input_ids) - >>> print(input_tokens) - ['<|begin_of_text|>(()*169) In this image, we see', '<|reserved_special_token_0|><|reserved_special_token_0|><|reserved_special_token_0|><|begin_of_text|>bla bla bla(()*169)'] - ``` - - Args: - images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*): - The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1. - text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*): - The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings - (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set - `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). - Wherever an image token, `` is encountered it is expanded to - `` + `` + `` * `image_seq_len` * `. - image_seq_len (`int`, *optional*): - The length of the image sequence. If not provided, the default value of self.image_seq_len is used. - image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2)) - return_tensors (`Union[str, TensorType]`, *optional*): - If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more - information. + image_seq_len (`int`, *optional*): + The length of the image sequence. If not provided, the default value of self.image_seq_len is used. 
diff --git a/src/transformers/models/instructblip/processing_instructblip.py b/src/transformers/models/instructblip/processing_instructblip.py
index cfed52f745ae..017e0e2564d2 100644
--- a/src/transformers/models/instructblip/processing_instructblip.py
+++ b/src/transformers/models/instructblip/processing_instructblip.py
@@ -23,6 +23,7 @@
 from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
 from ...utils import logging
+from ...utils.auto_docstring import auto_docstring

 logger = logging.get_logger(__name__)
@@ -44,26 +45,15 @@ class InstructBlipProcessorKwargs(ProcessingKwargs, total=False):
    }

+@auto_docstring
 class InstructBlipProcessor(ProcessorMixin):
-    r"""
-    Constructs an InstructBLIP processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
-    processor.
-
-    [`InstructBlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
-    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
-
-    Args:
-        image_processor (`BlipImageProcessor`):
-            An instance of [`BlipImageProcessor`]. The image processor is a required input.
-        tokenizer (`AutoTokenizer`):
-            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        """
        qformer_tokenizer (`AutoTokenizer`):
            An instance of [`PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
-    """
-
-    def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs):
+        """
        if not hasattr(tokenizer, "image_token"):
            self.image_token = AddedToken("<image>", normalized=False, special=True)
            tokenizer.add_tokens([self.image_token], special_tokens=True)
@@ -73,26 +63,13 @@ def __init__(self, image_processor, tokenizer, qformer_tokenizer, num_query_toke
        super().__init__(image_processor, tokenizer, qformer_tokenizer)

+    @auto_docstring
    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        **kwargs: Unpack[InstructBlipProcessorKwargs],
    ) -> BatchFeature:
-        """
-        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
-        [`BertTokenizerFast.__call__`] to prepare text for the model.
-
-        Please refer to the docstring of the above two methods for more information.
-        Args:
-            images (`ImageInput`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-        """
        if images is None and text is None:
            raise ValueError("You have to specify at least images or text.")
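With the hand-written `__call__` docstring removed in favor of `@auto_docstring`, a short usage sketch may help readers of this diff. This is an illustration only, assuming the public `Salesforce/instructblip-vicuna-7b` checkpoint and a placeholder image; it is not part of the change itself:

```python
# Minimal usage sketch for InstructBlipProcessor (illustration, not from this diff).
from PIL import Image
from transformers import InstructBlipProcessor

processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

image = Image.new("RGB", (224, 224))  # stand-in for a real image
inputs = processor(images=image, text="What is shown in the picture?", return_tensors="pt")

# Both tokenizers run: the main tokenizer produces `input_ids`/`attention_mask`,
# the Q-Former tokenizer produces `qformer_input_ids`/`qformer_attention_mask`,
# and the image processor contributes `pixel_values`.
print(sorted(inputs.keys()))
```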
- """ if images is None and text is None: raise ValueError("You have to specify at least images or text.") diff --git a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py index 81d0103b2742..cc776e5a1e70 100644 --- a/src/transformers/models/instructblipvideo/processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/processing_instructblipvideo.py @@ -28,32 +28,22 @@ TruncationStrategy, ) from ...utils import TensorType, logging +from ...utils.auto_docstring import auto_docstring from ...video_utils import VideoInput logger = logging.get_logger(__name__) +@auto_docstring class InstructBlipVideoProcessor(ProcessorMixin): - r""" - Constructs an InstructBLIPVideo processor which wraps a InstructBLIP image processor and a LLaMa/T5 tokenizer into a single - processor. - - [`InstructBlipVideoProcessor`] offers all the functionalities of [`InstructBlipVideoVideoProcessor`] and [`AutoTokenizer`]. See the - docstring of [`~InstructBlipVideoProcessor.__call__`] and [`~InstructBlipVideoProcessor.decode`] for more information. - - Args: - video_processor (`InstructBlipVideoVideoProcessor`): - An instance of [`InstructBlipVideoVideoProcessor`]. The video processor is a required input. - tokenizer (`AutoTokenizer`): - An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): + """ qformer_tokenizer (`AutoTokenizer`): An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input. num_query_tokens (`int`, *optional*): Number of tokens used by the Qformer as queries, should be same as in model's config. - """ - - def __init__(self, video_processor, tokenizer, qformer_tokenizer, num_query_tokens=None, **kwargs): + """ if not hasattr(tokenizer, "video_token"): self.video_token = AddedToken("