Add files using upload-large-folder tool
- .github/ISSUE_TEMPLATE/custom.md +8 -0
- .github/SECURITY.md +3 -0
- .github/workflows/lint.yaml +22 -0
- .github/workflows/publish.yaml +29 -0
- .ipynb_checkpoints/GRPO_TEST-checkpoint.jsonl +0 -0
- .ipynb_checkpoints/GRPOtrain-checkpoint.sh +38 -0
- .ipynb_checkpoints/README_CN-checkpoint.md +413 -0
- .ipynb_checkpoints/VLLM-checkpoint.sh +7 -0
- .ipynb_checkpoints/clean_transcripts-checkpoint.py +95 -0
- .ipynb_checkpoints/compare_scores-checkpoint.py +96 -0
- .ipynb_checkpoints/dialogue_length_ranges-checkpoint.png +0 -0
- .ipynb_checkpoints/infer-checkpoint.py +63 -0
- GRPO_TRAIN.jsonl +0 -0
- docs/transformers/build/lib/transformers/models/phimoe/__init__.py +28 -0
- docs/transformers/build/lib/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py +155 -0
- docs/transformers/build/lib/transformers/models/pix2struct/image_processing_pix2struct.py +464 -0
- docs/transformers/build/lib/transformers/models/pixtral/__init__.py +30 -0
- docs/transformers/build/lib/transformers/models/pixtral/configuration_pixtral.py +106 -0
- docs/transformers/build/lib/transformers/models/pixtral/image_processing_pixtral.py +472 -0
- docs/transformers/build/lib/transformers/models/pixtral/modeling_pixtral.py +505 -0
.github/ISSUE_TEMPLATE/custom.md
ADDED
@@ -0,0 +1,8 @@
---
name: Custom issue template
about: Describe this issue template's purpose here.
title: ''
labels: ''
assignees: ''

---
.github/SECURITY.md
ADDED
@@ -0,0 +1,3 @@
# Reporting Security Issues

Security issues in a deep learning project usually come from non-standard third-party packages or continuously running services. If you encounter a security issue in our project, please consider reporting it to us. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.
.github/workflows/lint.yaml
ADDED
@@ -0,0 +1,22 @@
name: Lint test

on: [push, pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.10
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'
      - name: Install pre-commit hook
        run: |
          pip install pre-commit
      - name: Linting
        run: pre-commit run --all-files
.github/workflows/publish.yaml
ADDED
@@ -0,0 +1,29 @@
name: release

on:
  push:
    tags:
      - 'v**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-publish
  cancel-in-progress: true

jobs:
  build-n-publish:
    runs-on: ubuntu-22.04
    #if: startsWith(github.event.ref, 'refs/tags')
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python 3.10
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'
      - name: Install wheel
        run: pip install wheel packaging setuptools==69.5.1
      - name: Build ModelScope Swift
        run: python setup.py sdist bdist_wheel
      - name: Publish package to PyPI
        run: |
          pip install twine
          twine upload dist/* --skip-existing -u __token__ -p ${{ secrets.PYPI_API_TOKEN }}
.ipynb_checkpoints/GRPO_TEST-checkpoint.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
.ipynb_checkpoints/GRPOtrain-checkpoint.sh
ADDED
@@ -0,0 +1,38 @@
WANDB_API_KEY="a7ab128385681b17ad156ad0d8c81ba3e2296164" \
CUDA_VISIBLE_DEVICES=0,1 \
NPROC_PER_NODE=2 \
swift rlhf \
    --rlhf_type grpo \
    --model /root/autodl-tmp/output_7B_FULL_cotSFT/v11-20250721-183605/checkpoint-330 \
    --external_plugins GRPO/Reward.py \
    --reward_funcs external_r1v_acc external_r1v_format_acc \
    --use_vllm false \
    --train_type full \
    --torch_dtype bfloat16 \
    --dataset 'all_dataset_train_resampled_16000.jsonl' \
    --max_completion_length 512 \
    --num_train_epochs 2 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --learning_rate 1e-6 \
    --gradient_accumulation_steps 2 \
    --save_strategy 'steps' \
    --eval_strategy 'steps' \
    --eval_steps 290 \
    --save_steps 290 \
    --save_total_limit 5 \
    --logging_steps 5 \
    --output_dir /root/autodl-tmp/output_7B_GRPO \
    --warmup_ratio 0.01 \
    --dataloader_num_workers 1 \
    --num_generations 2 \
    --temperature 1.0 \
    --log_completions true \
    --num_iterations 1 \
    --async_generate false \
    --beta 0.01 \
    --deepspeed zero3_offload \
    --report_to wandb \
    # --vllm_mode server \
    # --vllm_server_host 127.0.0.1 \
    # --vllm_server_port 8000 \
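Note: `--external_plugins GRPO/Reward.py` and `--reward_funcs external_r1v_acc external_r1v_format_acc` point at a custom reward plugin that is not included in this commit excerpt. Conceptually, a GRPO reward function scores each sampled completion against its ground-truth label. The sketch below is purely hypothetical — it reproduces neither the actual Reward.py nor the ms-swift plugin registration API, and the `<overall score>` tag format is borrowed from compare_scores-checkpoint.py further down.

```python
import re
from typing import List

def hypothetical_accuracy_reward(completions: List[str], solution: List[int]) -> List[float]:
    """Toy GRPO-style reward: 1.0 when the completion's <overall score> tag matches the label."""
    rewards = []
    for completion, label in zip(completions, solution):
        match = re.search(r"<overall score>(\d+)</overall score>", completion)
        rewards.append(1.0 if match and int(match.group(1)) == label else 0.0)
    return rewards
```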
.ipynb_checkpoints/README_CN-checkpoint.md
ADDED
@@ -0,0 +1,413 @@
# SWIFT (Scalable lightWeight Infrastructure for Fine-Tuning)

<p align="center">
<br>
<img src="asset/banner.png"/>
<br>
<p>
<p align="center">
<a href="https://modelscope.cn/home">ModelScope Community Website</a>
<br>
Chinese&nbsp;&nbsp;|&nbsp;&nbsp;<a href="README.md">English</a>
</p>


<p align="center">
<img src="https://img.shields.io/badge/python-3.10-5be.svg">
<img src="https://img.shields.io/badge/pytorch-%E2%89%A52.0-orange.svg">
<a href="https://github.com/modelscope/modelscope/"><img src="https://img.shields.io/badge/modelscope-%E2%89%A51.19-5D91D4.svg"></a>
<a href="https://pypi.org/project/ms-swift/"><img src="https://badge.fury.io/py/ms-swift.svg"></a>
<a href="https://github.com/modelscope/swift/blob/main/LICENSE"><img src="https://img.shields.io/github/license/modelscope/swift"></a>
<a href="https://pepy.tech/project/ms-swift"><img src="https://pepy.tech/badge/ms-swift"></a>
<a href="https://github.com/modelscope/swift/pulls"><img src="https://img.shields.io/badge/PR-welcome-55EB99.svg"></a>
</p>

<p align="center">
<a href="https://trendshift.io/repositories/6427" target="_blank"><img src="https://trendshift.io/api/badge/repositories/6427" alt="modelscope%2Fswift | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>

<p align="center">
<a href="https://arxiv.org/abs/2408.05517">Paper</a> &nbsp | <a href="https://swift.readthedocs.io/en/latest/">English Documentation</a> &nbsp | &nbsp <a href="https://swift.readthedocs.io/zh-cn/latest/">Chinese Documentation</a> &nbsp
</p>

## 📖 Table of Contents
- [User Group](#-user-group)
- [Introduction](#-introduction)
- [News](#-news)
- [Installation](#%EF%B8%8F-installation)
- [Quick Start](#-quick-start)
- [How to Use](#-how-to-use)
- [License](#-license)
- [Citation](#-citation)

## ☎ User Group

Scan the QR codes below to join our community groups:

[Discord Group](https://discord.com/invite/D27yfEFVz5) | WeChat Group
:-------------------------:|:-------------------------:
<img src="asset/discord_qr.jpg" width="200" height="200"> | <img src="asset/wechat.png" width="200" height="200">

## 📝 Introduction
🍲 ms-swift is a fine-tuning and deployment framework for large language models and multimodal large models provided by the ModelScope community. It supports training (pre-training, fine-tuning, human alignment), inference, evaluation, quantization, and deployment of 500+ LLMs and 200+ multimodal models. The LLMs include Qwen3, Qwen3-MoE, Qwen2.5, InternLM3, GLM4, Mistral, DeepSeek-R1, Yi1.5, TeleChat2, Baichuan2, Gemma2, and more; the multimodal models include Qwen2.5-VL, Qwen2-Audio, Llama4, Llava, InternVL2.5, MiniCPM-V-2.6, GLM4v, Xcomposer2.5, Yi-VL, DeepSeek-VL2, Phi3.5-Vision, GOT-OCR2, and more.

🍔 In addition, ms-swift gathers the latest training techniques, including lightweight methods such as LoRA, QLoRA, Llama-Pro, LongLoRA, GaLore, Q-GaLore, LoRA+, LISA, DoRA, FourierFt, ReFT, UnSloth, and Liger, as well as human-alignment methods such as DPO, GRPO, RM, PPO, KTO, CPO, SimPO, and ORPO. ms-swift can accelerate the inference, evaluation, and deployment modules with vLLM and LMDeploy, and can quantize large models with GPTQ, AWQ, and BNB. It also provides a Gradio-based Web UI and a rich set of best practices.

**Why choose ms-swift?**
- 🍎 **Model types**: the full pipeline from training to deployment for 500+ text-only LLMs and **200+ multimodal LLMs**, as well as all-to-all omni-modal models, sequence classification models, and embedding models.
- **Dataset types**: 150+ built-in datasets for pre-training, fine-tuning, human alignment, multimodal tasks, and more, with support for custom datasets.
- **Hardware support**: CPU, RTX series, T4/V100, A10/A100/H100, Ascend NPU, MPS, and more.
- 🍊 **Lightweight training**: supports LoRA, QLoRA, DoRA, LoRA+, ReFT, RS-LoRA, LLaMAPro, Adapter, GaLore, Q-Galore, LISA, UnSloth, Liger-Kernel, and other lightweight fine-tuning methods.
- **Distributed training**: supports distributed data parallelism (DDP), simple model parallelism via device_map, DeepSpeed ZeRO2/ZeRO3, FSDP, and other distributed training techniques.
- **Quantized training**: supports training BNB, AWQ, GPTQ, AQLM, HQQ, and EETQ quantized models.
- **RLHF training**: supports DPO, GRPO, RM, PPO, KTO, CPO, SimPO, ORPO, and other human-alignment methods for both text-only and multimodal large models.
- 🍓 **Multimodal training**: supports training models on image, video, and audio modalities, covering VQA, captioning, OCR, and grounding tasks.
- **UI-driven training**: offers training, inference, evaluation, and quantization through a web interface, covering the complete large-model pipeline.
- **Plugins and extensions**: supports custom model and dataset extensions, as well as customization of components such as loss, metric, trainer, loss-scale, callback, and optimizer.
- 🍉 **Toolbox capabilities**: provides not only training support for large and multimodal models but also the full pipeline of inference, evaluation, quantization, and deployment.
- **Inference acceleration**: supports the PyTorch, vLLM, and LMDeploy inference engines and provides an OpenAI-compatible interface to accelerate the inference, deployment, and evaluation modules.
- **Model evaluation**: uses EvalScope as the evaluation backend and supports 100+ evaluation datasets for both text and multimodal models.
- **Model quantization**: supports AWQ, GPTQ, and BNB quantized export; exported models can be accelerated with vLLM/LmDeploy and support continued training.

## 🎉 News
- 🎁 2025.05.11: The reward model in GRPO supports custom processing logic; see the GenRM example [here](./docs/source/Instruction/GRPO.md#自定义奖励模型).
- 🎁 2025.04.15: The ms-swift paper has been accepted by AAAI 2025; it is available [here](https://ojs.aaai.org/index.php/AAAI/article/view/35383).
- 🎁 2025.03.23: Multi-round GRPO is supported for multi-turn dialogue training scenarios (e.g. agent tool calling); see the [training script](examples/train/grpo/internal/train_multi_round.sh).
- 🎁 2025.03.16: Megatron parallelism is supported for training; see the [Megatron-SWIFT training documentation](https://swift.readthedocs.io/zh-cn/latest/Instruction/Megatron-SWIFT训练.html).
- 🎁 2025.03.15: Fine-tuning of embedding models, both text-only and multimodal, is supported; see the [training scripts](examples/train/embedding).
- 🎁 2025.03.05: GRPO hybrid mode is supported; a script for training a 72B model on 4 GPUs (4*80G) is available [here](examples/train/grpo/internal/train_72b_4gpu.sh). vLLM tensor parallelism is also supported; see the training script [here](examples/train/grpo/internal/multi_gpu_mp_colocate.sh).
- 🎁 2025.02.21: The GRPO algorithm supports LMDeploy; see the training script [here](examples/train/grpo/internal/full_lmdeploy.sh). GRPO performance has also been benchmarked, and a few tricks raised training speed by 300%; see the WanDB table [here](https://wandb.ai/tastelikefeet/grpo_perf_test?nw=nwuseryuzezyz).
- 🎁 2025.02.21: The `swift sample` command is supported. See the reinforcement fine-tuning script [here](docs/source/Instruction/强化微调.md) and the large-model API distillation sampling script [here](examples/sampler/distill/distill.sh).
- 🔥 2025.02.12: The GRPO (Group Relative Policy Optimization) training algorithm is supported; see the documentation [here](docs/source/Instruction/GRPO.md).
- 🎁 2024.12.04: Major **ms-swift 3.0** release. See the [release notes and changes](https://swift.readthedocs.io/zh-cn/latest/Instruction/ReleaseNote3.0.html).
<details><summary>More</summary>

- 🎉 2024.08.12: The ms-swift paper has been published on arXiv and can be read [here](https://arxiv.org/abs/2408.05517).
- 🔥 2024.08.05: [evalscope](https://github.com/modelscope/evalscope/) is supported as the backend for evaluating large and multimodal models.
- 🔥 2024.07.29: [vllm](https://github.com/vllm-project/vllm) and [lmdeploy](https://github.com/InternLM/lmdeploy) are supported for accelerating inference of large and multimodal models; simply add `--infer_backend vllm/lmdeploy` to infer/deploy/eval.
- 🔥 2024.07.24: Human preference alignment training for multimodal large models is supported, including DPO/ORPO/SimPO/CPO/KTO/RM/PPO.
- 🔥 2024.02.01: Agent training is supported! The training algorithm comes from this [paper](https://arxiv.org/pdf/2309.00986.pdf).
</details>

## 🛠️ Installation
Install with pip:
```shell
pip install ms-swift -U
```

Install from source:
```shell
# pip install git+https://github.com/modelscope/ms-swift.git

git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e .
```

Runtime environment:

|              | Range        | Recommended | Notes |
| ------------ |--------------| ----------- | ----- |
| python       | >=3.9        | 3.10        ||
| cuda         |              | cuda12      |Not required for CPU, NPU, or MPS|
| torch        | >=2.0        |             ||
| transformers | >=4.33       | 4.51        ||
| modelscope   | >=1.23       |             ||
| peft         | >=0.11,<0.16 |             ||
| trl          | >=0.13,<0.18 | 0.17        |RLHF|
| deepspeed    | >=0.14       | 0.14.5      |Training|
| vllm         | >=0.5.1      | 0.7.3/0.8   |Inference/deployment/evaluation|
| lmdeploy     | >=0.5        | 0.8         |Inference/deployment/evaluation|
| evalscope    | >=0.11       |             |Evaluation|

More optional dependencies are listed [here](https://github.com/modelscope/ms-swift/blob/main/requirements/install_all.sh).


## 🚀 Quick Start

Self-cognition fine-tuning of Qwen2.5-7B-Instruct on a single 3090 in **10 minutes**:

### Command line
```shell
# 22GB
CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --train_type lora \
    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
              'AI-ModelScope/alpaca-gpt4-data-en#500' \
              'swift/self-cognition#500' \
    --torch_dtype bfloat16 \
    --num_train_epochs 1 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --learning_rate 1e-4 \
    --lora_rank 8 \
    --lora_alpha 32 \
    --target_modules all-linear \
    --gradient_accumulation_steps 16 \
    --eval_steps 50 \
    --save_steps 50 \
    --save_total_limit 2 \
    --logging_steps 5 \
    --max_length 2048 \
    --output_dir output \
    --system 'You are a helpful assistant.' \
    --warmup_ratio 0.05 \
    --dataloader_num_workers 4 \
    --model_author swift \
    --model_name swift-robot
```

Tips:
- To train with a custom dataset, organize it as described [here](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%95%B0%E6%8D%AE%E9%9B%86.html) and specify `--dataset <dataset_path>`.
- The `--model_author` and `--model_name` arguments only take effect when the dataset includes `swift/self-cognition`.
- To train a different model, just change `--model <model_id/model_path>`.
- Models and datasets are downloaded from ModelScope by default. To use HuggingFace, specify `--use_hf true`.

After training, run inference on the trained weights with the commands below:
- Replace `--adapters` with the last checkpoint folder produced by training. Since the adapters folder contains the training argument file `args.json`, there is no need to specify `--model` or `--system`; swift reads them automatically. Set `--load_args false` to disable this behavior.

```shell
# Interactive command-line inference
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --adapters output/vx-xxx/checkpoint-xxx \
    --stream true \
    --temperature 0 \
    --max_new_tokens 2048

# merge-lora and accelerate inference with vLLM
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --adapters output/vx-xxx/checkpoint-xxx \
    --stream true \
    --merge_lora true \
    --infer_backend vllm \
    --max_model_len 8192 \
    --temperature 0 \
    --max_new_tokens 2048
```

Finally, push the model to ModelScope with:
```shell
CUDA_VISIBLE_DEVICES=0 \
swift export \
    --adapters output/vx-xxx/checkpoint-xxx \
    --push_to_hub true \
    --hub_model_id '<your-model-id>' \
    --hub_token '<your-sdk-token>' \
    --use_hf false
```

### Web-UI

The Web-UI is a **zero-threshold** training and deployment interface built on Gradio; see [here](https://swift.readthedocs.io/zh-cn/latest/GetStarted/Web-UI.html) for details.

```shell
swift web-ui
```

### Using Python
ms-swift also supports training and inference from Python. Below is **pseudocode** for training and inference; see [here](https://github.com/modelscope/ms-swift/blob/main/examples/notebook/qwen2_5-self-cognition/self-cognition-sft.ipynb) for the full example.

Training:
```python
# Get the model and template, and add a trainable LoRA module
model, tokenizer = get_model_tokenizer(model_id_or_path, ...)
template = get_template(model.model_meta.template, tokenizer, ...)
model = Swift.prepare_model(model, lora_config)

# Download and load the dataset, and encode the text into tokens
train_dataset, val_dataset = load_dataset(dataset_id_or_path, ...)
train_dataset = EncodePreprocessor(template=template)(train_dataset, num_proc=num_proc)
val_dataset = EncodePreprocessor(template=template)(val_dataset, num_proc=num_proc)

# Train
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=template.data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    template=template,
)
trainer.train()
```

Inference:
```python
# Inference with the native PyTorch engine
engine = PtEngine(model_id_or_path, adapters=[lora_checkpoint])
infer_request = InferRequest(messages=[{'role': 'user', 'content': 'who are you?'}])
request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)

resp_list = engine.infer([infer_request], request_config)
print(f'response: {resp_list[0].choices[0].message.content}')
```

## ✨ How to Use

Below are the simplest examples of going from training to deployment with ms-swift; see the [examples](https://github.com/modelscope/ms-swift/tree/main/examples) for more.

- To use other models or datasets (including multimodal ones), just change `--model` to the corresponding model id or path and `--dataset` to the corresponding dataset id or path.
- Models and datasets are downloaded from ModelScope by default. To use HuggingFace, specify `--use_hf true`.

| Useful links |
| ------ |
| [🔥Command-line arguments](https://swift.readthedocs.io/zh-cn/latest/Instruction/%E5%91%BD%E4%BB%A4%E8%A1%8C%E5%8F%82%E6%95%B0.html) |
| [Supported models and datasets](https://swift.readthedocs.io/zh-cn/latest/Instruction/%E6%94%AF%E6%8C%81%E7%9A%84%E6%A8%A1%E5%9E%8B%E5%92%8C%E6%95%B0%E6%8D%AE%E9%9B%86.html) |
| [Custom models](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B.html), [🔥Custom datasets](https://swift.readthedocs.io/zh-cn/latest/Customization/%E8%87%AA%E5%AE%9A%E4%B9%89%E6%95%B0%E6%8D%AE%E9%9B%86.html) |
| [LLM tutorials](https://github.com/modelscope/modelscope-classroom/tree/main/LLM-tutorial) |

### Training
Supported training methods:

| Method | Full-parameter | LoRA | QLoRA | DeepSpeed | Multi-node | Multimodal |
| ------ | ------ | ---- | ----- | ------ | ------ | ------ |
| Pre-training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/pretrain/train.sh) | ✅ | ✅ | ✅ | ✅ | ✅ |
| Instruction SFT | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/full/train.sh) | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/lora_sft.sh) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/qlora) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-gpu/deepspeed) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multi-node) | [✅](https://github.com/modelscope/ms-swift/tree/main/examples/train/multimodal) |
| DPO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/dpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/dpo.sh) |
| GRPO training | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/grpo_zero2.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/grpo/internal/multi_node) | ✅ |
| Reward model training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/rm.sh) | ✅ | ✅ |
| PPO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/ppo.sh) | ✅ | ❌ |
| KTO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/kto.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/multimodal/rlhf/kto.sh) |
| CPO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/cpo.sh) | ✅ | ✅ |
| SimPO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/simpo.sh) | ✅ | ✅ |
| ORPO training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/rlhf/orpo.sh) | ✅ | ✅ |
| Classification model training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_5/sft.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/seq_cls/qwen2_vl/sft.sh) |
| Embedding model training | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gte.sh) | ✅ | ✅ | ✅ | [✅](https://github.com/modelscope/ms-swift/blob/main/examples/train/embedding/train_gme.sh) |


Pre-training:
```shell
# 8*A100
NPROC_PER_NODE=8 \
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
swift pt \
    --model Qwen/Qwen2.5-7B \
    --dataset swift/chinese-c4 \
    --streaming true \
    --train_type full \
    --deepspeed zero2 \
    --output_dir output \
    --max_steps 10000 \
    ...
```

Fine-tuning:
```shell
CUDA_VISIBLE_DEVICES=0 swift sft \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh \
    --train_type lora \
    --output_dir output \
    ...
```

RLHF:
```shell
CUDA_VISIBLE_DEVICES=0 swift rlhf \
    --rlhf_type dpo \
    --model Qwen/Qwen2.5-7B-Instruct \
    --dataset hjh0119/shareAI-Llama3-DPO-zh-en-emoji \
    --train_type lora \
    --output_dir output \
    ...
```


### Inference
```shell
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --stream true \
    --infer_backend pt \
    --max_new_tokens 2048

# LoRA
CUDA_VISIBLE_DEVICES=0 swift infer \
    --model Qwen/Qwen2.5-7B-Instruct \
    --adapters swift/test_lora \
    --stream true \
    --infer_backend pt \
    --temperature 0 \
    --max_new_tokens 2048
```

### Web UI inference
```shell
CUDA_VISIBLE_DEVICES=0 swift app \
    --model Qwen/Qwen2.5-7B-Instruct \
    --stream true \
    --infer_backend pt \
    --max_new_tokens 2048 \
    --lang zh
```

### Deployment
```shell
CUDA_VISIBLE_DEVICES=0 swift deploy \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend vllm
```

### Sampling
```shell
CUDA_VISIBLE_DEVICES=0 swift sample \
    --model LLM-Research/Meta-Llama-3.1-8B-Instruct \
    --sampler_engine pt \
    --num_return_sequences 5 \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh#5
```

### Evaluation
```shell
CUDA_VISIBLE_DEVICES=0 swift eval \
    --model Qwen/Qwen2.5-7B-Instruct \
    --infer_backend lmdeploy \
    --eval_backend OpenCompass \
    --eval_dataset ARC_c
```

### Quantization
```shell
CUDA_VISIBLE_DEVICES=0 swift export \
    --model Qwen/Qwen2.5-7B-Instruct \
    --quant_bits 4 --quant_method awq \
    --dataset AI-ModelScope/alpaca-gpt4-data-zh \
    --output_dir Qwen2.5-7B-Instruct-AWQ
```

### Pushing models
```shell
swift export \
    --model <model-path> \
    --push_to_hub true \
    --hub_model_id '<model-id>' \
    --hub_token '<sdk-token>'
```


## 🏛 License

This framework is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). For models and datasets, please check their original resource pages and comply with the corresponding licenses.

## 📎 Citation

```bibtex
@misc{zhao2024swiftascalablelightweightinfrastructure,
      title={SWIFT:A Scalable lightWeight Infrastructure for Fine-Tuning},
      author={Yuze Zhao and Jintao Huang and Jinghan Hu and Xingjun Wang and Yunlin Mao and Daoze Zhang and Zeyinzi Jiang and Zhikai Wu and Baole Ai and Ang Wang and Wenmeng Zhou and Yingda Chen},
      year={2024},
      eprint={2408.05517},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2408.05517},
}
```

## Star History

[](https://star-history.com/#modelscope/ms-swift&Date)
.ipynb_checkpoints/VLLM-checkpoint.sh
ADDED
@@ -0,0 +1,7 @@
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
    --model /root/autodl-tmp/output_7B_FULL_cotSFT/v0-20250621-230827/Qwen2.5-Omni-7B \
    --tokenizer /root/autodl-tmp/output_7B_FULL_cotSFT/v0-20250621-230827/Qwen2.5-Omni-7B \
    --dtype bfloat16 \
    --host 127.0.0.1 \
    --port 8000 \
    --gpu-memory-utilization 0.9
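The script above serves an OpenAI-compatible HTTP API on 127.0.0.1:8000. A minimal sketch of querying it with the `requests` package (assuming the server is up and the `model` field matches the `--model` path passed above):

```python
import requests

# vLLM's OpenAI-compatible server exposes /v1/chat/completions; the model name
# must match the --model argument the server was launched with.
resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "/root/autodl-tmp/output_7B_FULL_cotSFT/v0-20250621-230827/Qwen2.5-Omni-7B",
        "messages": [{"role": "user", "content": "Hello, who are you?"}],
        "max_tokens": 256,
        "temperature": 0,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```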
.ipynb_checkpoints/clean_transcripts-checkpoint.py
ADDED
@@ -0,0 +1,95 @@
import json
import re
from typing import List, Dict, Tuple

def parse_timestamp(timestamp: str) -> Tuple[int, int]:
    """Convert timestamp string like '00:15' to seconds."""
    minutes, seconds = map(int, timestamp.split(':'))
    return minutes * 60 + seconds

def extract_time_and_speaker(line: str) -> Tuple[Tuple[int, int], str]:
    """Extract time range and speaker from a line."""
    # Extract time range
    time_match = re.match(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\] (Speaker [A-Z]):', line)
    if not time_match:
        return None, None

    start_time = parse_timestamp(time_match.group(1))
    end_time = parse_timestamp(time_match.group(2))
    speaker = time_match.group(3)

    return (start_time, end_time), speaker

def has_overlap(range1: Tuple[int, int], range2: Tuple[int, int]) -> bool:
    """Check if two time ranges overlap."""
    start1, end1 = range1
    start2, end2 = range2
    return not (end1 <= start2 or end2 <= start1)

def has_same_speaker_overlap(transcript: str) -> bool:
    """Check if a transcript contains overlapping timestamps for the same speaker."""
    lines = transcript.split('\n')
    # Dictionary to store time ranges for each speaker
    speaker_ranges = {}

    for line in lines:
        if not line.strip():
            continue

        time_range, speaker = extract_time_and_speaker(line)
        if time_range is None or speaker is None:
            continue

        # Check for overlaps with existing ranges of the same speaker
        if speaker in speaker_ranges:
            for existing_range in speaker_ranges[speaker]:
                if has_overlap(time_range, existing_range):
                    return True

            speaker_ranges[speaker].append(time_range)
        else:
            speaker_ranges[speaker] = [time_range]

    return False

def process_file(input_file: str, output_file: str, delete_file: str):
    """Process the JSON file and separate entries with same-speaker overlapping timestamps."""
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, dict):
        data = [data]

    cleaned_data = []
    deleted_data = []
    removed_count = 0

    for entry in data:
        if 'model_output' in entry:
            if not has_same_speaker_overlap(entry['model_output']):
                cleaned_data.append(entry)
            else:
                deleted_data.append(entry)
                removed_count += 1
                print(f"Removing entry with key: {entry.get('key', 'unknown')}")

    # Save cleaned data
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(cleaned_data, f, ensure_ascii=False, indent=2)

    # Save deleted data
    with open(delete_file, 'w', encoding='utf-8') as f:
        json.dump(deleted_data, f, ensure_ascii=False, indent=2)

    print(f"\nProcessing Summary:")
    print(f"Processed {len(data)} entries")
    print(f"Removed {removed_count} entries with same-speaker overlapping timestamps")
    print(f"Remaining entries: {len(cleaned_data)}")

if __name__ == '__main__':
    input_file = 'silence_overlaps/transcriptions.json'
    output_file = 'silence_overlaps/cleaned_transcriptions2.json'
    delete_file = 'silence_overlaps/delete_transcript2.json'
    process_file(input_file, output_file, delete_file)
    print(f"\nCleaned transcriptions have been saved to {output_file}")
    print(f"Deleted entries have been saved to {delete_file}")
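For reference, `extract_time_and_speaker` only matches lines of the form `[MM:SS - MM:SS] Speaker X: ...`. A small self-contained sketch of the same overlap check on a toy transcript (the regex is copied from the script above; the transcript text itself is made up):

```python
import re

transcript = (
    "[00:10 - 00:20] Speaker A: Hello there.\n"
    "[00:15 - 00:25] Speaker A: Still talking over myself.\n"
    "[00:21 - 00:30] Speaker B: Hi!"
)

pattern = re.compile(r'\[(\d{2}:\d{2}) - (\d{2}:\d{2})\] (Speaker [A-Z]):')
to_sec = lambda ts: int(ts[:2]) * 60 + int(ts[3:])

seen = {}  # speaker -> list of (start, end) ranges in seconds
overlap_found = False
for line in transcript.split("\n"):
    m = pattern.match(line)
    if not m:
        continue
    start, end, speaker = to_sec(m.group(1)), to_sec(m.group(2)), m.group(3)
    # Same overlap test as has_overlap() in the script above.
    if any(not (end <= s or e <= start) for s, e in seen.get(speaker, [])):
        overlap_found = True
    seen.setdefault(speaker, []).append((start, end))

print(overlap_found)  # True: Speaker A's first two segments overlap
```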
.ipynb_checkpoints/compare_scores-checkpoint.py
ADDED
@@ -0,0 +1,96 @@
import json
import re
from collections import defaultdict

infer_result_path = '/root/autodl-tmp/output_7B_GRPO/v28-20250722-002940/checkpoint-870/infer_result/53_HH.jsonl'
test_path = '/root/autodl-tmp/ms-swift/all_audio_test_50.jsonl'
output_path = 'inference_comparison_result.json'

def extract_overall_score(response_text):
    match = re.search(r'<overall score>(\d+)</overall score>', response_text)
    if match:
        return int(match.group(1))
    return None

def main():
    # Read the infer_result file and build a mapping from audio to score
    infer_audio2score = {}
    with open(infer_result_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            score = extract_overall_score(data['response'])
            audios = tuple(data.get('audios', []))
            infer_audio2score[audios] = {
                'score': score,
                'raw_response': data['response']
            }

    # Read the test file and build a mapping from audio to solution
    test_audio2solution = {}
    with open(test_path, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line)
            solution = data['solution']
            audios = tuple(data.get('audios', []))
            test_audio2solution[audios] = solution

    # Collect per-class statistics, incorrect samples, and all inference results
    stats_per_class = defaultdict(lambda: {'correct': 0, 'incorrect': 0})
    incorrect_samples_solution1 = []
    all_results = []

    total = 0
    correct = 0

    for audios, solution in test_audio2solution.items():
        infer_entry = infer_audio2score.get(audios, None)
        infer_score = infer_entry['score'] if infer_entry else None
        raw_response = infer_entry['raw_response'] if infer_entry else None
        match = infer_score == solution

        # Collect all results
        all_results.append({
            'audios': audios,
            'gt_solution': solution,
            'predicted_score': infer_score,
            'match': match,
            'response': raw_response
        })

        if match:
            correct += 1
            stats_per_class[solution]['correct'] += 1
        else:
            stats_per_class[solution]['incorrect'] += 1
            if solution == 1:
                incorrect_samples_solution1.append({
                    'audios': audios,
                    'gt_solution': solution,
                    'predicted_score': infer_score,
                    'response': raw_response
                })

        total += 1

    # Overall accuracy
    print(f'\nOverall Accuracy: {correct}/{total} = {correct/total:.2%}\n')

    # Per-class accuracy
    print("Per-Class Accuracy:")
    for solution, stats in sorted(stats_per_class.items()):
        total_class = stats['correct'] + stats['incorrect']
        accuracy = stats['correct'] / total_class if total_class > 0 else 0.0
        print(f'Class {solution}: Correct={stats["correct"]}, Incorrect={stats["incorrect"]}, Accuracy={accuracy:.2%}')

    # List samples with solution = 1 that were predicted incorrectly
    print("\nIncorrect Samples for solution = 1:")
    for sample in incorrect_samples_solution1:
        print(json.dumps(sample, indent=2, ensure_ascii=False))

    # Write all results to a JSON file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"\nAll inference comparison results saved to: {output_path}")

if __name__ == '__main__':
    main()
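The comparison hinges on model responses containing an `<overall score>N</overall score>` tag; when the tag is missing, `extract_overall_score` returns `None` and the sample counts as a mismatch. A quick illustration of the extraction (the sample strings are made up):

```python
import re

def extract_overall_score(response_text):
    # Same regex as in the script above: pull the integer out of the <overall score> tag.
    match = re.search(r'<overall score>(\d+)</overall score>', response_text)
    return int(match.group(1)) if match else None

print(extract_overall_score("...reasoning...<overall score>4</overall score>"))  # 4
print(extract_overall_score("response without a score tag"))                     # None
```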
.ipynb_checkpoints/dialogue_length_ranges-checkpoint.png
ADDED
.ipynb_checkpoints/infer-checkpoint.py
ADDED
@@ -0,0 +1,63 @@
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import PtEngine, RequestConfig, safe_snapshot_download, get_model_tokenizer, get_template, InferRequest
import json
from transformers import AutoProcessor
from swift.tuners import Swift
last_model_checkpoint = '/root/autodl-tmp/output_7B_Lora_cotSFT/v2-20250613-111902/checkpoint-3'

# Model
model_id_or_path = '/root/autodl-tmp/output_7B_Lora_allmission/v2-20250610-190504/checkpoint-1000-merged'  # model_id or model_path
system = 'You are a helpful assistant.'
infer_backend = 'pt'

# Generation parameters
max_new_tokens = 2048
temperature = 0
stream = False
template_type = None
default_system = system  # None: use the model's default default_system
# Initialize the audio processor
model, tokenizer = get_model_tokenizer(model_id_or_path)
# Initialize the engine
model = Swift.from_pretrained(model, last_model_checkpoint)
template_type = template_type or model.model_meta.template
template = get_template(template_type, tokenizer, default_system=default_system)
engine = PtEngine.from_model_template(model, template, max_batch_size=2)
request_config = RequestConfig(max_tokens=8192, temperature=0)

def load_test_data(json_file):
    test_requests = []
    with open(json_file, 'r', encoding='utf-8') as f:
        for line in f:
            data = json.loads(line.strip())
            test_requests.append(InferRequest(
                messages=data['messages'],
                audios=data['audios']
            ))
    return test_requests

def main():
    # Load the test data
    test_file = 'dataset_allmissiontest.json'
    infer_requests = load_test_data(test_file)
    results = []
    resp_list = engine.infer(infer_requests, request_config)
    for i, resp in enumerate(resp_list):
        assistant_content = next((msg['content'] for msg in infer_requests[i].messages if msg['role'] == 'assistant'), None)
        result = {
            "index": i,
            "truth": assistant_content,
            "response": resp.choices[0].message.content,
        }
        results.append(result)
        print(f'truth{i}: {assistant_content}')
        print(f'response{i}: {resp.choices[0].message.content}')
    output_file = 'inference_results.json'
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)


if __name__ == '__main__':
    main()
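`load_test_data` expects a JSONL file in which every line has a `messages` list (with the ground truth as an `assistant` turn, as read back in `main`) and an `audios` list. A hypothetical record in that shape, purely for illustration (paths and text are placeholders):

```python
import json

# One illustrative JSONL line in the format load_test_data() parses.
sample = {
    "messages": [
        {"role": "user", "content": "Please rate this dialogue."},
        {"role": "assistant", "content": "ground-truth answer text"},
    ],
    "audios": ["/path/to/example.wav"],
}
print(json.dumps(sample, ensure_ascii=False))
```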
GRPO_TRAIN.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
docs/transformers/build/lib/transformers/models/phimoe/__init__.py
ADDED
@@ -0,0 +1,28 @@
# Copyright 2024 Microsoft and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_phimoe import *
    from .modeling_phimoe import *

else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
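This `__init__.py` follows the transformers lazy-import pattern: the submodules are imported normally only for type checkers, while at runtime the module object is replaced by a `_LazyModule` that defers the heavy imports until an attribute is first accessed. As a rough illustration of the underlying idea only — a PEP 562 module-level `__getattr__`, not the actual `_LazyModule` implementation:

```python
# toy_pkg/__init__.py -- deferred submodule import via module-level __getattr__ (PEP 562).
import importlib

_SUBMODULES = {"heavy_feature": ".heavy_feature"}  # attribute name -> relative module path

def __getattr__(name):
    # Import the submodule only when the attribute is actually touched.
    if name in _SUBMODULES:
        module = importlib.import_module(_SUBMODULES[name], __name__)
        globals()[name] = module  # cache so later lookups bypass __getattr__
        return module
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
```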
docs/transformers/build/lib/transformers/models/pix2struct/convert_pix2struct_original_pytorch_to_hf.py
ADDED
@@ -0,0 +1,155 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import re

import torch
from flax.traverse_util import flatten_dict
from t5x import checkpoints

from transformers import (
    AutoTokenizer,
    Pix2StructConfig,
    Pix2StructForConditionalGeneration,
    Pix2StructImageProcessor,
    Pix2StructProcessor,
    Pix2StructTextConfig,
    Pix2StructVisionConfig,
)


def get_flax_param(t5x_checkpoint_path):
    flax_params = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)
    flax_params = flatten_dict(flax_params)
    return flax_params


def rename_and_convert_flax_params(flax_dict):
    converted_dict = {}

    CONVERSION_MAPPING = {
        "token_embedder": "embeddings",
        "encoder_norm": "layernorm",
        "kernel": "weight",
        ".out": ".output",
        "scale": "weight",
        "embedders_0.pos_embedding": "row_embedder.weight",
        "embedders_1.pos_embedding": "column_embedder.weight",
    }

    DECODER_CONVERSION_MAPPING = {
        "query": "attention.query",
        "key": "attention.key",
        "value": "attention.value",
        "output.dense": "output",
        "encoder_decoder_attention.o": "encoder_decoder_attention.attention.o",
        "pre_self_attention_layer_norm": "self_attention.layer_norm",
        "pre_cross_attention_layer_norm": "encoder_decoder_attention.layer_norm",
        "mlp.": "mlp.DenseReluDense.",
        "pre_mlp_layer_norm": "mlp.layer_norm",
        "self_attention.o": "self_attention.attention.o",
        "decoder.embeddings.embedding": "decoder.embed_tokens.weight",
        "decoder.relpos_bias.rel_embedding": "decoder.layer.0.self_attention.attention.relative_attention_bias.weight",
        "decoder.decoder_norm.weight": "decoder.final_layer_norm.weight",
        "decoder.logits_dense.weight": "decoder.lm_head.weight",
    }

    for key in flax_dict.keys():
        if "target" in key:
            # remove the first prefix from the key
            new_key = ".".join(key[1:])

            # rename the key
            for old, new in CONVERSION_MAPPING.items():
                new_key = new_key.replace(old, new)

            if "decoder" in new_key:
                for old, new in DECODER_CONVERSION_MAPPING.items():
                    new_key = new_key.replace(old, new)

            if "layers" in new_key and "decoder" not in new_key:
                # use regex to replace the layer number
                new_key = re.sub(r"layers_(\d+)", r"layer.\1", new_key)
                new_key = new_key.replace("encoder", "encoder.encoder")

            elif "layers" in new_key and "decoder" in new_key:
                # use regex to replace the layer number
                new_key = re.sub(r"layers_(\d+)", r"layer.\1", new_key)

            converted_dict[new_key] = flax_dict[key]

    converted_torch_dict = {}
    # convert converted_dict into torch format
    for key in converted_dict.keys():
        if ("embed_tokens" not in key) and ("embedder" not in key):
            converted_torch_dict[key] = torch.from_numpy(converted_dict[key].T)
        else:
            converted_torch_dict[key] = torch.from_numpy(converted_dict[key])

    return converted_torch_dict


def convert_pix2struct_original_pytorch_checkpoint_to_hf(
    t5x_checkpoint_path, pytorch_dump_folder_path, use_large=False, is_vqa=False
):
    flax_params = get_flax_param(t5x_checkpoint_path)

    if not use_large:
        encoder_config = Pix2StructVisionConfig()
        decoder_config = Pix2StructTextConfig()
    else:
        encoder_config = Pix2StructVisionConfig(
            hidden_size=1536, d_ff=3968, num_attention_heads=24, num_hidden_layers=18
        )
        decoder_config = Pix2StructTextConfig(hidden_size=1536, d_ff=3968, num_heads=24, num_layers=18)
    config = Pix2StructConfig(
        vision_config=encoder_config.to_dict(), text_config=decoder_config.to_dict(), is_vqa=is_vqa
    )

    model = Pix2StructForConditionalGeneration(config)

    torch_params = rename_and_convert_flax_params(flax_params)
    model.load_state_dict(torch_params)

    tok = AutoTokenizer.from_pretrained("ybelkada/test-pix2struct-tokenizer")
    image_processor = Pix2StructImageProcessor()
    processor = Pix2StructProcessor(image_processor=image_processor, tokenizer=tok)

    if use_large:
        processor.image_processor.max_patches = 4096

    processor.image_processor.is_vqa = True

    # mkdir if needed
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)

    model.save_pretrained(pytorch_dump_folder_path)
    processor.save_pretrained(pytorch_dump_folder_path)

    print("Model saved in {}".format(pytorch_dump_folder_path))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--t5x_checkpoint_path", default=None, type=str, help="Path to the original T5x checkpoint.")
    parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--use_large", action="store_true", help="Use large model.")
    parser.add_argument("--is_vqa", action="store_true", help="Use large model.")
    args = parser.parse_args()

    convert_pix2struct_original_pytorch_checkpoint_to_hf(
        args.t5x_checkpoint_path, args.pytorch_dump_folder_path, args.use_large
    )
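Note that the `__main__` block defines an `--is_vqa` flag but only forwards `args.use_large` to the conversion function, so `is_vqa` stays at its default of `False` when the script is run from the command line. The function can also be called directly; a sketch with hypothetical paths (assuming the function above has been imported):

```python
# Hypothetical paths -- replace with a real T5X checkpoint and an output directory.
convert_pix2struct_original_pytorch_checkpoint_to_hf(
    t5x_checkpoint_path="/path/to/t5x_checkpoint",
    pytorch_dump_folder_path="/path/to/pix2struct-hf-dump",
    use_large=False,
    is_vqa=False,
)
```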
docs/transformers/build/lib/transformers/models/pix2struct/image_processing_pix2struct.py
ADDED
@@ -0,0 +1,464 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Pix2Struct."""

import io
import math
from typing import Dict, Optional, Union

import numpy as np
from huggingface_hub import hf_hub_download

from ...image_processing_utils import BaseImageProcessor, BatchFeature
from ...image_transforms import convert_to_rgb, normalize, to_channel_dimension_format, to_pil_image
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    get_image_size,
    infer_channel_dimension_format,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from ...utils import TensorType, is_torch_available, is_vision_available, logging
from ...utils.import_utils import requires_backends


if is_vision_available():
    import textwrap

    from PIL import Image, ImageDraw, ImageFont

if is_torch_available():
    import torch

logger = logging.get_logger(__name__)
DEFAULT_FONT_PATH = "ybelkada/fonts"


# adapted from: https://discuss.pytorch.org/t/tf-image-extract-patches-in-pytorch/171409/2
def torch_extract_patches(image_tensor, patch_height, patch_width):
    """
    Utiliy function to extract patches from a given image tensor. Returns a tensor of shape (1, `patch_height`,
    `patch_width`, `num_channels`x `patch_height` x `patch_width`)

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    """
    requires_backends(torch_extract_patches, ["torch"])

    image_tensor = image_tensor.unsqueeze(0)
    patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
    patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
    patches = patches.permute(0, 4, 2, 3, 1).reshape(
        image_tensor.size(2) // patch_height,
        image_tensor.size(3) // patch_width,
        image_tensor.size(1) * patch_height * patch_width,
    )
    return patches.unsqueeze(0)


# Adapted from https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L106
def render_text(
    text: str,
    text_size: int = 36,
    text_color: str = "black",
    background_color: str = "white",
    left_padding: int = 5,
    right_padding: int = 5,
    top_padding: int = 5,
    bottom_padding: int = 5,
    font_bytes: Optional[bytes] = None,
    font_path: Optional[str] = None,
) -> Image.Image:
    """
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    """
    requires_backends(render_text, "vision")
    # Add new lines so that each line is no more than 80 characters.

    wrapper = textwrap.TextWrapper(width=80)
    lines = wrapper.wrap(text=text)
    wrapped_text = "\n".join(lines)

    if font_bytes is not None and font_path is None:
        font = io.BytesIO(font_bytes)
    elif font_path is not None:
        font = font_path
    else:
        font = hf_hub_download(DEFAULT_FONT_PATH, "Arial.TTF")
    font = ImageFont.truetype(font, encoding="UTF-8", size=text_size)

    # Use a temporary canvas to determine the width and height in pixels when
    # rendering the text.
    temp_draw = ImageDraw.Draw(Image.new("RGB", (1, 1), background_color))
    _, _, text_width, text_height = temp_draw.textbbox((0, 0), wrapped_text, font)

    # Create the actual image with a bit of padding around the text.
    image_width = text_width + left_padding + right_padding
    image_height = text_height + top_padding + bottom_padding
    image = Image.new("RGB", (image_width, image_height), background_color)
    draw = ImageDraw.Draw(image)
    draw.text(xy=(left_padding, top_padding), text=wrapped_text, fill=text_color, font=font)
    return image


# Adapted from https://github.com/google-research/pix2struct/blob/0e1779af0f4db4b652c1d92b3bbd2550a7399123/pix2struct/preprocessing/preprocessing_utils.py#L87
def render_header(
    image: np.ndarray, header: str, input_data_format: Optional[Union[str, ChildProcessError]] = None, **kwargs
):
    """
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    """
    requires_backends(render_header, "vision")

    # Convert to PIL image if necessary
    image = to_pil_image(image, input_data_format=input_data_format)

    header_image = render_text(header, **kwargs)
    new_width = max(header_image.width, image.width)

    new_height = int(image.height * (new_width / image.width))
    new_header_height = int(header_image.height * (new_width / header_image.width))

    new_image = Image.new("RGB", (new_width, new_height + new_header_height), "white")
    new_image.paste(header_image.resize((new_width, new_header_height)), (0, 0))
    new_image.paste(image.resize((new_width, new_height)), (0, new_header_height))

    # Convert back to the original framework if necessary
    new_image = to_numpy_array(new_image)

    if infer_channel_dimension_format(new_image) == ChannelDimension.LAST:
        new_image = to_channel_dimension_format(new_image, ChannelDimension.LAST)

    return new_image


class Pix2StructImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://arxiv.org/pdf/2210.03347.pdf).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    """

    model_input_names = ["flattened_patches"]

    def __init__(
        self,
        do_convert_rgb: bool = True,
        do_normalize: bool = True,
        patch_size: Dict[str, int] = None,
        max_patches: int = 2048,
        is_vqa: bool = False,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
        self.do_normalize = do_normalize
        self.do_convert_rgb = do_convert_rgb
        self.max_patches = max_patches
        self.is_vqa = is_vqa

    def extract_flattened_patches(
        self,
        image: np.ndarray,
        max_patches: int,
|
230 |
+
image: np.ndarray,
|
231 |
+
max_patches: int,
|
232 |
+
patch_size: dict,
|
233 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
234 |
+
**kwargs,
|
235 |
+
) -> np.ndarray:
|
236 |
+
"""
|
237 |
+
Extract flattened patches from an image.
|
238 |
+
|
239 |
+
Args:
|
240 |
+
image (`np.ndarray`):
|
241 |
+
Image to extract flattened patches from.
|
242 |
+
max_patches (`int`):
|
243 |
+
Maximum number of patches to extract.
|
244 |
+
patch_size (`dict`):
|
245 |
+
Dictionary containing the patch height and width.
|
246 |
+
|
247 |
+
Returns:
|
248 |
+
result (`np.ndarray`):
|
249 |
+
A sequence of `max_patches` flattened patches.
|
250 |
+
"""
|
251 |
+
requires_backends(self.extract_flattened_patches, "torch")
|
252 |
+
|
253 |
+
# convert to torch
|
254 |
+
image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
|
255 |
+
image = torch.from_numpy(image)
|
256 |
+
|
257 |
+
patch_height, patch_width = patch_size["height"], patch_size["width"]
|
258 |
+
image_height, image_width = get_image_size(image, ChannelDimension.FIRST)
|
259 |
+
|
260 |
+
# maximize scale s.t.
|
261 |
+
scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
|
262 |
+
num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
|
263 |
+
num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
|
264 |
+
resized_height = max(num_feasible_rows * patch_height, 1)
|
265 |
+
resized_width = max(num_feasible_cols * patch_width, 1)
|
266 |
+
|
267 |
+
image = torch.nn.functional.interpolate(
|
268 |
+
image.unsqueeze(0),
|
269 |
+
size=(resized_height, resized_width),
|
270 |
+
mode="bilinear",
|
271 |
+
align_corners=False,
|
272 |
+
antialias=True,
|
273 |
+
).squeeze(0)
|
274 |
+
|
275 |
+
# [1, rows, columns, patch_height * patch_width * image_channels]
|
276 |
+
patches = torch_extract_patches(image, patch_height, patch_width)
|
277 |
+
|
278 |
+
patches_shape = patches.shape
|
279 |
+
rows = patches_shape[1]
|
280 |
+
columns = patches_shape[2]
|
281 |
+
depth = patches_shape[3]
|
282 |
+
|
283 |
+
# [rows * columns, patch_height * patch_width * image_channels]
|
284 |
+
patches = patches.reshape([rows * columns, depth])
|
285 |
+
|
286 |
+
# [rows * columns, 1]
|
287 |
+
row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
|
288 |
+
col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])
|
289 |
+
|
290 |
+
# Offset by 1 so the ids do not contain zeros, which represent padding.
|
291 |
+
row_ids += 1
|
292 |
+
col_ids += 1
|
293 |
+
|
294 |
+
# Prepare additional patch features.
|
295 |
+
# [rows * columns, 1]
|
296 |
+
row_ids = row_ids.to(torch.float32)
|
297 |
+
col_ids = col_ids.to(torch.float32)
|
298 |
+
|
299 |
+
# [rows * columns, 2 + patch_height * patch_width * image_channels]
|
300 |
+
result = torch.cat([row_ids, col_ids, patches], -1)
|
301 |
+
|
302 |
+
# [max_patches, 2 + patch_height * patch_width * image_channels]
|
303 |
+
result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()
|
304 |
+
|
305 |
+
result = to_numpy_array(result)
|
306 |
+
|
307 |
+
return result
|
308 |
+
|
309 |
+
def normalize(
|
310 |
+
self,
|
311 |
+
image: np.ndarray,
|
312 |
+
data_format: Optional[Union[str, ChannelDimension]] = None,
|
313 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
314 |
+
**kwargs,
|
315 |
+
) -> np.ndarray:
|
316 |
+
"""
|
317 |
+
Normalize an image. image = (image - image_mean) / image_std.
|
318 |
+
|
319 |
+
The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
|
320 |
+
https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization
|
321 |
+
|
322 |
+
Args:
|
323 |
+
image (`np.ndarray`):
|
324 |
+
Image to normalize.
|
325 |
+
data_format (`str` or `ChannelDimension`, *optional*):
|
326 |
+
The channel dimension format for the output image. If unset, the channel dimension format of the input
|
327 |
+
image is used.
|
328 |
+
input_data_format (`str` or `ChannelDimension`, *optional*):
|
329 |
+
The channel dimension format of the input image. If not provided, it will be inferred.
|
330 |
+
"""
|
331 |
+
if image.dtype == np.uint8:
|
332 |
+
image = image.astype(np.float32)
|
333 |
+
|
334 |
+
# take mean across the whole `image`
|
335 |
+
mean = np.mean(image)
|
336 |
+
std = np.std(image)
|
337 |
+
adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))
|
338 |
+
|
339 |
+
return normalize(
|
340 |
+
image,
|
341 |
+
mean=mean,
|
342 |
+
std=adjusted_stddev,
|
343 |
+
data_format=data_format,
|
344 |
+
input_data_format=input_data_format,
|
345 |
+
**kwargs,
|
346 |
+
)
|
347 |
+
|
348 |
+
def preprocess(
|
349 |
+
self,
|
350 |
+
images: ImageInput,
|
351 |
+
header_text: Optional[str] = None,
|
352 |
+
do_convert_rgb: Optional[bool] = None,
|
353 |
+
do_normalize: Optional[bool] = None,
|
354 |
+
max_patches: Optional[int] = None,
|
355 |
+
patch_size: Optional[Dict[str, int]] = None,
|
356 |
+
return_tensors: Optional[Union[str, TensorType]] = None,
|
357 |
+
data_format: ChannelDimension = ChannelDimension.FIRST,
|
358 |
+
input_data_format: Optional[Union[str, ChannelDimension]] = None,
|
359 |
+
**kwargs,
|
360 |
+
) -> ImageInput:
|
361 |
+
"""
|
362 |
+
Preprocess an image or batch of images. The processor first computes the maximum possible number of
|
363 |
+
aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
|
364 |
+
image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
|
365 |
+
images are standardized following the tensorflow implementation of `per_image_standardization`
|
366 |
+
(https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).
|
367 |
+
|
368 |
+
|
369 |
+
Args:
|
370 |
+
images (`ImageInput`):
|
371 |
+
Image to preprocess. Expects a single or batch of images.
|
372 |
+
header_text (`Union[List[str], str]`, *optional*):
|
373 |
+
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
|
374 |
+
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
|
375 |
+
Whether to convert the image to RGB.
|
376 |
+
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
|
377 |
+
Whether to normalize the image.
|
378 |
+
max_patches (`int`, *optional*, defaults to `self.max_patches`):
|
379 |
+
Maximum number of patches to extract.
|
380 |
+
patch_size (`dict`, *optional*, defaults to `self.patch_size`):
|
381 |
+
Dictionary containing the patch height and width.
|
382 |
+
return_tensors (`str` or `TensorType`, *optional*):
|
383 |
+
The type of tensors to return. Can be one of:
|
384 |
+
- Unset: Return a list of `np.ndarray`.
|
385 |
+
- `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
|
386 |
+
- `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
|
387 |
+
- `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
|
388 |
+
- `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
|
389 |
+
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
|
390 |
+
The channel dimension format for the output image. Can be one of:
|
391 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
392 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
393 |
+
- Unset: Use the channel dimension format of the input image.
|
394 |
+
input_data_format (`ChannelDimension` or `str`, *optional*):
|
395 |
+
The channel dimension format for the input image. If unset, the channel dimension format is inferred
|
396 |
+
from the input image. Can be one of:
|
397 |
+
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
|
398 |
+
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
|
399 |
+
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
|
400 |
+
"""
|
401 |
+
do_normalize = do_normalize if do_normalize is not None else self.do_normalize
|
402 |
+
do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
|
403 |
+
patch_size = patch_size if patch_size is not None else self.patch_size
|
404 |
+
max_patches = max_patches if max_patches is not None else self.max_patches
|
405 |
+
is_vqa = self.is_vqa
|
406 |
+
|
407 |
+
if kwargs.get("data_format", None) is not None:
|
408 |
+
raise ValueError("data_format is not an accepted input as the outputs are ")
|
409 |
+
|
410 |
+
images = make_list_of_images(images)
|
411 |
+
|
412 |
+
if not valid_images(images):
|
413 |
+
raise ValueError(
|
414 |
+
"Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
|
415 |
+
"torch.Tensor, tf.Tensor or jax.ndarray."
|
416 |
+
)
|
417 |
+
|
418 |
+
# PIL RGBA images are converted to RGB
|
419 |
+
if do_convert_rgb:
|
420 |
+
images = [convert_to_rgb(image) for image in images]
|
421 |
+
|
422 |
+
# All transformations expect numpy arrays.
|
423 |
+
images = [to_numpy_array(image) for image in images]
|
424 |
+
|
425 |
+
if input_data_format is None:
|
426 |
+
# We assume that all images have the same channel dimension format.
|
427 |
+
input_data_format = infer_channel_dimension_format(images[0])
|
428 |
+
|
429 |
+
if is_vqa:
|
430 |
+
if header_text is None:
|
431 |
+
raise ValueError("A header text must be provided for VQA models.")
|
432 |
+
font_bytes = kwargs.pop("font_bytes", None)
|
433 |
+
font_path = kwargs.pop("font_path", None)
|
434 |
+
|
435 |
+
if isinstance(header_text, str):
|
436 |
+
header_text = [header_text] * len(images)
|
437 |
+
|
438 |
+
images = [
|
439 |
+
render_header(image, header_text[i], font_bytes=font_bytes, font_path=font_path)
|
440 |
+
for i, image in enumerate(images)
|
441 |
+
]
|
442 |
+
|
443 |
+
if do_normalize:
|
444 |
+
images = [self.normalize(image=image, input_data_format=input_data_format) for image in images]
|
445 |
+
|
446 |
+
# convert to torch tensor and permute
|
447 |
+
images = [
|
448 |
+
self.extract_flattened_patches(
|
449 |
+
image=image, max_patches=max_patches, patch_size=patch_size, input_data_format=input_data_format
|
450 |
+
)
|
451 |
+
for image in images
|
452 |
+
]
|
453 |
+
|
454 |
+
# create attention mask in numpy
|
455 |
+
attention_masks = [(image.sum(axis=-1) != 0).astype(np.float32) for image in images]
|
456 |
+
|
457 |
+
encoded_outputs = BatchFeature(
|
458 |
+
data={"flattened_patches": images, "attention_mask": attention_masks}, tensor_type=return_tensors
|
459 |
+
)
|
460 |
+
|
461 |
+
return encoded_outputs
|
462 |
+
|
463 |
+
|
464 |
+
__all__ = ["Pix2StructImageProcessor"]
|
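For orientation, a minimal usage sketch of the processor added above (illustrative only, not part of this diff; it assumes `Pillow` and the torch backend are installed, and that the default font can be downloaded from the Hub):

```python
from PIL import Image
from transformers import Pix2StructImageProcessor

# is_vqa=True makes the processor render `header_text` onto each image via render_header.
processor = Pix2StructImageProcessor(is_vqa=True, max_patches=1024)
image = Image.new("RGB", (640, 480), "white")

inputs = processor(images=image, header_text="What is in the picture?", return_tensors="pt")
# Per patch: 2 coordinate features + 16 * 16 pixels * 3 channels = 770 values.
print(inputs["flattened_patches"].shape)  # torch.Size([1, 1024, 770])
print(inputs["attention_mask"].shape)     # torch.Size([1, 1024])
```

The attention mask simply marks which of the `max_patches` rows were filled with real patches rather than zero padding.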
docs/transformers/build/lib/transformers/models/pixtral/__init__.py
ADDED
@@ -0,0 +1,30 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure


if TYPE_CHECKING:
    from .configuration_pixtral import *
    from .image_processing_pixtral import *
    from .image_processing_pixtral_fast import *
    from .modeling_pixtral import *
    from .processing_pixtral import *
else:
    import sys

    _file = globals()["__file__"]
    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
docs/transformers/build/lib/transformers/models/pixtral/configuration_pixtral.py
ADDED
@@ -0,0 +1,106 @@
# coding=utf-8
# Copyright 2024 HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pixtral model configuration"""

from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class PixtralVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate a
    Pixtral vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to the vision encoder used by Pixtral-12B.

    e.g. [pixtral-hf/pixtral-9b](https://huggingface.co/pixtral-hf/pixtral-9b)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 4096):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input channels in the input images.
        image_size (`int`, *optional*, defaults to 1024):
            Max dimension of the input images.
        patch_size (`int`, *optional*, defaults to 16):
            Size of the image patches.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            Activation function used in the hidden layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for the attention layers.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    Example:

    ```python
    >>> from transformers import PixtralVisionModel, PixtralVisionConfig

    >>> # Initializing a Pixtral-12B style configuration
    >>> config = PixtralVisionConfig()

    >>> # Initializing a model (with randomly initialized weights) from the configuration
    >>> model = PixtralVisionModel(config)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "pixtral"

    def __init__(
        self,
        hidden_size=1024,
        intermediate_size=4096,
        num_hidden_layers=24,
        num_attention_heads=16,
        num_channels=3,
        image_size=1024,
        patch_size=16,
        hidden_act="gelu",
        attention_dropout=0.0,
        rope_theta=10000.0,
        initializer_range=0.02,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.rope_theta = rope_theta
        self.head_dim = hidden_size // num_attention_heads
        self.initializer_range = initializer_range


__all__ = ["PixtralVisionConfig"]
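The derived attributes follow directly from the defaults above; a small illustrative check (a sketch, assuming a `transformers` build with Pixtral support is installed):

```python
from transformers import PixtralVisionConfig

config = PixtralVisionConfig()
# head_dim is computed in __init__ as hidden_size // num_attention_heads.
assert config.head_dim == config.hidden_size // config.num_attention_heads  # 64
# The rotary embedding in modeling_pixtral.py precomputes frequencies for a grid of at most
# (image_size // patch_size) patch positions per side.
print(config.image_size // config.patch_size)  # 64
```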
docs/transformers/build/lib/transformers/models/pixtral/image_processing_pixtral.py
ADDED
@@ -0,0 +1,472 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for Pixtral."""

import math
from typing import Dict, List, Optional, Tuple, Union

import numpy as np

from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from ...image_transforms import (
    pad,
    resize,
    to_channel_dimension_format,
)
from ...image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    get_image_size,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
    validate_kwargs,
    validate_preprocess_arguments,
)
from ...utils import TensorType, is_vision_available, logging
from ...utils.import_utils import requires_backends


logger = logging.get_logger(__name__)


if is_vision_available():
    import PIL


# Adapted from function in image_transforms.py to ensure any transparent pixels are converted to white.
def convert_to_rgb(image: ImageInput) -> ImageInput:
    """
    Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
    as is.

    Args:
        image (Image):
            The image to convert.
    """
    requires_backends(convert_to_rgb, ["vision"])

    if not isinstance(image, PIL.Image.Image):
        return image

    if image.mode == "RGB":
        return image

    # First we convert to RGBA to set background to white.
    image = image.convert("RGBA")

    # Create a new image with a white background.
    new_image = PIL.Image.new("RGBA", image.size, "WHITE")
    new_image.paste(image, (0, 0), image)
    new_image = new_image.convert("RGB")
    return new_image


def _num_image_tokens(image_size: Tuple[int, int], patch_size: Tuple[int, int]) -> int:
    """
    Calculate the number of image tokens given the image size and patch size.

    Args:
        image_size (`Tuple[int, int]`):
            The size of the image as `(height, width)`.
        patch_size (`Tuple[int, int]`):
            The patch size as `(height, width)`.

    Returns:
        `int`: The number of image tokens.
    """
    height, width = image_size
    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
    num_width_tokens = (width - 1) // patch_width + 1
    num_height_tokens = (height - 1) // patch_height + 1
    return num_height_tokens, num_width_tokens


def get_resize_output_image_size(
    input_image: ImageInput,
    size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    patch_size: Union[int, Tuple[int, int], List[int], Tuple[int]],
    input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> tuple:
    """
    Find the target (height, width) dimension of the output image after resizing given the input image and the desired
    size.

    Args:
        input_image (`ImageInput`):
            The image to resize.
        size (`int` or `Tuple[int, int]`):
            Maximum size an input image can be, as a single int or an `(height, width)` tuple.
        patch_size (`int` or `Tuple[int, int]`):
            The patch_size as `(height, width)` to use for resizing the image. If patch_size is an integer, `(patch_size, patch_size)`
            will be used.
        input_data_format (`ChannelDimension`, *optional*):
            The channel dimension format of the input image. If unset, will use the inferred format from the input.

    Returns:
        `tuple`: The target (height, width) dimension of the output image after resizing.
    """
    max_height, max_width = size if isinstance(size, (tuple, list)) else (size, size)
    patch_height, patch_width = patch_size if isinstance(patch_size, (tuple, list)) else (patch_size, patch_size)
    height, width = get_image_size(input_image, input_data_format)

    ratio = max(height / max_height, width / max_width)

    if ratio > 1:
        # Original implementation uses `round` which utilises bankers rounding, which can lead to surprising results
        # Here we use floor to ensure the image is always smaller than the given "longest_edge"
        height = int(math.floor(height / ratio))
        width = int(math.floor(width / ratio))

    num_height_tokens, num_width_tokens = _num_image_tokens((height, width), (patch_height, patch_width))
    return num_height_tokens * patch_height, num_width_tokens * patch_width


class PixtralImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Pixtral image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 1024}`):
            Size of the maximum dimension of either the height or width dimension of the image. Used to control how
            images are resized. If either the height or width are greater than `size["longest_edge"]` then both the
            height and width are rescaled by `height / ratio`, `width / ratio` where
            `ratio = max(height / longest_edge, width / longest_edge)`.
        patch_size (`Dict[str, int]` *optional*, defaults to `{"height": 16, "width": 16}`):
            Size of the patches in the model, used to calculate the output image size. Can be overridden by `patch_size` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        patch_size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        do_normalize: bool = True,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"longest_edge": 1024}
        patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
        patch_size = get_size_dict(patch_size, default_to_square=True)

        self.do_resize = do_resize
        self.size = size
        self.patch_size = patch_size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor
        self.do_normalize = do_normalize
        self.image_mean = image_mean if image_mean is not None else [0.48145466, 0.4578275, 0.40821073]
        self.image_std = image_std if image_std is not None else [0.26862954, 0.26130258, 0.27577711]
        self.do_convert_rgb = do_convert_rgb
        self._valid_processor_keys = [
            "images",
            "do_resize",
            "size",
            "patch_size",
            "resample",
            "do_rescale",
            "rescale_factor",
            "do_normalize",
            "image_mean",
            "image_std",
            "do_convert_rgb",
            "return_tensors",
            "data_format",
            "input_data_format",
        ]

    def resize(
        self,
        image: np.ndarray,
        size: Dict[str, int],
        patch_size: Dict[str, int],
        resample: PILImageResampling = PILImageResampling.BICUBIC,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Resize an image. The image is resized so that its longest edge is no greater than `size["longest_edge"]`
        while keeping the input aspect ratio, and both output dimensions are rounded up to multiples of the patch
        size.

        Args:
            image (`np.ndarray`):
                Image to resize.
            size (`Dict[str, int]`):
                Dict containing the longest possible edge of the image.
            patch_size (`Dict[str, int]`):
                Patch size used to calculate the size of the output image.
            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
                Resampling filter to use when resizing the image.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the image. If not provided, it will be the same as the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        """
        if "longest_edge" in size:
            size = (size["longest_edge"], size["longest_edge"])
        elif "height" in size and "width" in size:
            size = (size["height"], size["width"])
        else:
            raise ValueError("size must contain either 'longest_edge' or 'height' and 'width'.")

        if "height" in patch_size and "width" in patch_size:
            patch_size = (patch_size["height"], patch_size["width"])
        else:
            raise ValueError("patch_size must contain 'height' and 'width'.")

        output_size = get_resize_output_image_size(
            image,
            size=size,
            patch_size=patch_size,
            input_data_format=input_data_format,
        )
        return resize(
            image,
            size=output_size,
            resample=resample,
            data_format=data_format,
            input_data_format=input_data_format,
            **kwargs,
        )

    def _pad_for_batching(
        self,
        pixel_values: List[np.ndarray],
        image_sizes: List[List[int]],
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
    ):
        """
        Pads images on the height and width dimensions with zeros so that all images in the batch have the same shape.

        Args:
            pixel_values (`List[np.ndarray]`):
                A list of pixel value arrays, one per image.
            image_sizes (`List[List[int]]`):
                A list of sizes for each image in `pixel_values` in (height, width) format.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.
        Returns:
            List[`np.ndarray`]: The padded images.
        """

        max_shape = (
            max([size[0] for size in image_sizes]),
            max([size[1] for size in image_sizes]),
        )
        pixel_values = [
            pad(
                image,
                padding=((0, max_shape[0] - size[0]), (0, max_shape[1] - size[1])),
                data_format=data_format,
                input_data_format=input_data_format,
            )
            for image, size in zip(pixel_values, image_sizes)
        ]
        return pixel_values

    def preprocess(
        self,
        images: ImageInput,
        do_resize: Optional[bool] = None,
        size: Dict[str, int] = None,
        patch_size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: Optional[bool] = None,
        rescale_factor: Optional[float] = None,
        do_normalize: Optional[bool] = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Describes the maximum input dimensions to the model.
            patch_size (`Dict[str, int]`, *optional*, defaults to `self.patch_size`):
                Patch size in the model. Used to calculate the image after resizing.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        patch_size = patch_size if patch_size is not None else self.patch_size
        patch_size = get_size_dict(patch_size, default_to_square=True)

        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)

        images = make_list_of_images(images)

        if not valid_images(images[0]):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        validate_preprocess_arguments(
            do_rescale=do_rescale,
            rescale_factor=rescale_factor,
            do_normalize=do_normalize,
            image_mean=image_mean,
            image_std=image_std,
            do_resize=do_resize,
            size=size,
            resample=resample,
        )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if do_rescale and is_scaled_image(images[0]):
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        batch_images = []
        batch_image_sizes = []
        for image in images:
            if do_resize:
                image = self.resize(
                    image=image,
                    size=size,
                    patch_size=patch_size,
                    resample=resample,
                    input_data_format=input_data_format,
                )

            if do_rescale:
                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)

            if do_normalize:
                image = self.normalize(
                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
                )

            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)

            batch_images.append(image)
            batch_image_sizes.append(get_image_size(image, data_format))

        pixel_values = self._pad_for_batching(
            pixel_values=batch_images,
            image_sizes=batch_image_sizes,
            input_data_format=data_format,
            data_format=data_format,
        )

        return BatchFeature(
            data={"pixel_values": pixel_values, "image_sizes": batch_image_sizes}, tensor_type=return_tensors
        )


__all__ = ["PixtralImageProcessor"]
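As a rough illustration of how the resizing and batch padding above compose, a usage sketch (illustrative only, not part of this diff; the printed shapes assume the default `longest_edge=1024` and 16x16 patches):

```python
from PIL import Image
from transformers import PixtralImageProcessor

processor = PixtralImageProcessor()
# PIL sizes are (width, height): 512x768 already fits under the longest edge, 2048x1024 is halved.
images = [Image.new("RGB", (512, 768), "red"), Image.new("RGB", (2048, 1024), "blue")]

outputs = processor(images=images, return_tensors="np")
# Each image is resized to a multiple of the patch size, then zero-padded on the bottom/right
# to the largest (height, width) in the batch; image_sizes keeps the unpadded sizes.
print(outputs["image_sizes"])         # unpadded sizes: (768, 512) and (512, 1024)
print(outputs["pixel_values"].shape)  # padded batch: (2, 3, 768, 1024)
```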
docs/transformers/build/lib/transformers/models/pixtral/modeling_pixtral.py
ADDED
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# coding=utf-8
|
2 |
+
# Copyright 2024 Mistral and the HuggingFace Inc. team. All rights reserved.
|
3 |
+
#
|
4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
5 |
+
# you may not use this file except in compliance with the License.
|
6 |
+
# You may obtain a copy of the License at
|
7 |
+
#
|
8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
9 |
+
#
|
10 |
+
# Unless required by applicable law or agreed to in writing, software
|
11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13 |
+
# See the License for the specific language governing permissions and
|
14 |
+
# limitations under the License.
|
15 |
+
"""PyTorch Pixtral model."""
|
16 |
+
|
17 |
+
from typing import Optional, Tuple, Union
|
18 |
+
|
19 |
+
import torch
|
20 |
+
import torch.utils.checkpoint
|
21 |
+
from torch import nn
|
22 |
+
|
23 |
+
from ... import PreTrainedModel
|
24 |
+
from ...activations import ACT2FN
|
25 |
+
from ...modeling_outputs import BaseModelOutput
|
26 |
+
from ...modeling_rope_utils import dynamic_rope_update
|
27 |
+
from ...utils import (
|
28 |
+
add_start_docstrings,
|
29 |
+
add_start_docstrings_to_model_forward,
|
30 |
+
logging,
|
31 |
+
)
|
32 |
+
from .configuration_pixtral import PixtralVisionConfig
|
33 |
+
|
34 |
+
|
35 |
+
logger = logging.get_logger(__name__)
|
36 |
+
|
37 |
+
|
38 |
+
def position_ids_in_meshgrid(patch_embeds_list, max_width):
|
39 |
+
positions = []
|
40 |
+
for patch in patch_embeds_list:
|
41 |
+
height, width = patch.shape[-2:]
|
42 |
+
mesh = torch.meshgrid(torch.arange(height), torch.arange(width), indexing="ij")
|
43 |
+
h_grid, v_grid = torch.stack(mesh, dim=-1).reshape(-1, 2).chunk(2, -1)
|
44 |
+
ids = h_grid * max_width + v_grid
|
45 |
+
positions.append(ids[:, 0])
|
46 |
+
return torch.cat(positions)
|
47 |
+
|
48 |
+
|
49 |
+
class PixtralRotaryEmbedding(nn.Module):
|
50 |
+
"""
|
51 |
+
The key with pixtral embedding is just that you have a frequency for each pixel positions.
|
52 |
+
If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
|
53 |
+
is given by indexing the pre_computed frequency on the width and height.
|
54 |
+
|
55 |
+
What you output is of dimension (batch, height * width, dim) with dim the embed dim.
|
56 |
+
|
57 |
+
This simply means that for each image hidden state, you are going to add
|
58 |
+
a corresponding positional embedding, based on its index in the grid.
|
59 |
+
"""
|
60 |
+
|
61 |
+
def __init__(self, config, device=None):
|
62 |
+
super().__init__()
|
63 |
+
self.rope_type = "default"
|
64 |
+
self.dim = config.head_dim
|
65 |
+
self.base = config.rope_theta
|
66 |
+
max_patches_per_side = config.image_size // config.patch_size
|
67 |
+
freqs = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float() / self.dim))
|
68 |
+
|
69 |
+
h = torch.arange(max_patches_per_side, device=freqs.device)
|
70 |
+
w = torch.arange(max_patches_per_side, device=freqs.device)
|
71 |
+
|
72 |
+
freqs_h = torch.outer(h, freqs[::2]).float()
|
73 |
+
freqs_w = torch.outer(w, freqs[1::2]).float()
|
74 |
+
inv_freq = torch.cat(
|
75 |
+
[
|
76 |
+
freqs_h[:, None, :].repeat(1, max_patches_per_side, 1),
|
77 |
+
freqs_w[None, :, :].repeat(max_patches_per_side, 1, 1),
|
78 |
+
],
|
79 |
+
dim=-1,
|
80 |
+
).reshape(-1, self.dim // 2) # we reshape to only index on the position indexes, not tuple of indexes
|
81 |
+
# Different from paper, but it uses a different permutation in order to obtain the same calculation
|
82 |
+
|
83 |
+
# TODO maybe make it torch compatible later on. We can also just slice
|
84 |
+
self.register_buffer("inv_freq", torch.cat((inv_freq, inv_freq), dim=-1), persistent=False)
|
85 |
+
|
86 |
+
@torch.no_grad()
|
87 |
+
@dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope)
|
88 |
+
def forward(self, x, position_ids):
|
89 |
+
freqs = self.inv_freq[position_ids]
|
90 |
+
|
91 |
+
device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
|
92 |
+
with torch.autocast(device_type=device_type, enabled=False): # Force float32
|
93 |
+
emb = freqs
|
94 |
+
cos = emb.cos()
|
95 |
+
sin = emb.sin()
|
96 |
+
|
97 |
+
return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
|
98 |
+
|
99 |
+
|
100 |
+
# Copied from transformers.models.llama.modeling_llama.rotate_half
|
101 |
+
def rotate_half(x):
|
102 |
+
"""Rotates half the hidden dims of the input."""
|
103 |
+
x1 = x[..., : x.shape[-1] // 2]
|
104 |
+
x2 = x[..., x.shape[-1] // 2 :]
|
105 |
+
return torch.cat((-x2, x1), dim=-1)
|
106 |
+
|
107 |
+
|
108 |
+
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
|
109 |
+
"""Applies Rotary Position Embedding to the query and key tensors.
|
110 |
+
|
111 |
+
Args:
|
112 |
+
q (`torch.Tensor`): The query tensor.
|
113 |
+
k (`torch.Tensor`): The key tensor.
|
114 |
+
cos (`torch.Tensor`): The cosine part of the rotary embedding.
|
115 |
+
sin (`torch.Tensor`): The sine part of the rotary embedding.
|
116 |
+
position_ids (`torch.Tensor`, *optional*):
|
117 |
+
Deprecated and unused.
|
118 |
+
unsqueeze_dim (`int`, *optional*, defaults to 1):
|
119 |
+
The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
|
120 |
+
sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
|
121 |
+
that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
|
122 |
+
k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
|
123 |
+
cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
|
124 |
+
the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
|
125 |
+
Returns:
|
126 |
+
`tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
|
127 |
+
"""
|
128 |
+
cos = cos.unsqueeze(unsqueeze_dim)
|
129 |
+
sin = sin.unsqueeze(unsqueeze_dim)
|
130 |
+
q_embed = (q * cos) + (rotate_half(q) * sin)
|
131 |
+
k_embed = (k * cos) + (rotate_half(k) * sin)
|
132 |
+
return q_embed, k_embed
|
133 |
+
|
134 |
+
|
135 |
+
class PixtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads

        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.o_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, patches, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        query_states = query_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(batch_size, patches, self.num_heads, self.head_dim).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=0)

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.matmul(attn_weights, value_states)

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, patches, -1)

        attn_output = self.o_proj(attn_output)

        return attn_output, attn_weights

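
# --- Illustrative sketch (not part of the original Pixtral source): driving PixtralAttention
# with dummy patch features. The ad-hoc SimpleNamespace config stands in for a
# PixtralVisionConfig; its values are assumptions for demonstration only.
def _example_pixtral_attention():
    from types import SimpleNamespace

    cfg = SimpleNamespace(hidden_size=64, num_attention_heads=4, attention_dropout=0.0)
    attn = PixtralAttention(cfg)
    hidden = torch.randn(1, 9, cfg.hidden_size)  # [batch, patches, hidden_size]
    head_dim = cfg.hidden_size // cfg.num_attention_heads
    cos, sin = torch.randn(9, head_dim), torch.randn(9, head_dim)  # per-position rotary tables
    out, weights = attn(hidden, attention_mask=None, position_embeddings=(cos, sin))
    assert out.shape == (1, 9, cfg.hidden_size)
    assert weights.shape == (1, cfg.num_attention_heads, 9, 9)
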
# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Pixtral
class PixtralMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj

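
# --- Illustrative sketch (not part of the original Pixtral source): the gated MLP above
# computes down_proj(act_fn(gate_proj(x)) * up_proj(x)), i.e. a SwiGLU-style feed-forward when
# `hidden_act` is "silu". The tiny SimpleNamespace config below is an assumption for demonstration.
def _example_pixtral_mlp():
    from types import SimpleNamespace

    cfg = SimpleNamespace(hidden_size=8, intermediate_size=16, hidden_act="silu")
    mlp = PixtralMLP(cfg)
    x = torch.randn(2, 5, cfg.hidden_size)
    assert mlp(x).shape == x.shape  # the feed-forward preserves the hidden dimension
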
# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Pixtral
class PixtralRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        PixtralRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

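
# --- Illustrative sketch (not part of the original Pixtral source): PixtralRMSNorm rescales
# each hidden vector by the reciprocal of its root mean square, x / sqrt(mean(x**2) + eps),
# then applies a learned per-channel weight; unlike LayerNorm, no mean is subtracted.
def _example_pixtral_rmsnorm():
    norm = PixtralRMSNorm(hidden_size=8, eps=1e-6)
    x = torch.randn(2, 5, 8)
    manual = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
    assert torch.allclose(norm(x), manual, atol=1e-5)  # weight is initialized to ones
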
class PixtralAttentionLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.attention_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)
        self.feed_forward = PixtralMLP(config)
        self.attention = PixtralAttention(config)
        self.ffn_norm = PixtralRMSNorm(config.hidden_size, eps=1e-5)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by
                very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.attention_norm(hidden_states)
        hidden_states, attn_weights = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.ffn_norm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)
        return outputs

class PixtralTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layers = torch.nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            self.layers.append(PixtralAttentionLayer(config))
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    encoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_embeddings,
                    output_attentions,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    position_embeddings=position_embeddings,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

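
# --- Illustrative sketch (not part of the original Pixtral source): running a tiny
# PixtralTransformer stack end to end. The SimpleNamespace config is an assumption standing in
# for a PixtralVisionConfig; passing explicit output flags avoids touching config defaults.
def _example_pixtral_transformer():
    from types import SimpleNamespace

    cfg = SimpleNamespace(
        hidden_size=32,
        intermediate_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        attention_dropout=0.0,
        hidden_act="silu",
    )
    encoder = PixtralTransformer(cfg)
    seq_len, head_dim = 6, cfg.hidden_size // cfg.num_attention_heads
    embeds = torch.randn(1, seq_len, cfg.hidden_size)
    cos, sin = torch.randn(seq_len, head_dim), torch.randn(seq_len, head_dim)
    out = encoder(
        embeds,
        attention_mask=None,
        position_embeddings=(cos, sin),
        output_attentions=False,
        output_hidden_states=True,
        return_dict=True,
    )
    assert out.last_hidden_state.shape == embeds.shape
    assert len(out.hidden_states) == cfg.num_hidden_layers + 1  # input embeddings + one per layer
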
PIXTRAL_START_DOCSTRING = r"""
    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
    heads etc.)

    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
    usage and behavior.

    Parameters:
        config ([`PixtralVisionConfig`]):
            Model configuration class with all the parameters of the vision encoder. Initializing with a config file
            does not load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""


class PixtralPreTrainedModel(PreTrainedModel):
    config_class = PixtralVisionConfig
    base_model_prefix = "model"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["PixtralAttentionLayer"]

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, PixtralRMSNorm):
            module.weight.data.fill_(1.0)


PIXTRAL_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`AutoImageProcessor.__call__`] for details.
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`, *optional*):
            The sizes of the images in the batch, being (height, width) for each image.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

def generate_block_attention_mask(patch_embeds_list, tensor):
    dtype = tensor.dtype
    device = tensor.device
    seq_len = tensor.shape[1]
    d_min = torch.finfo(dtype).min
    causal_mask = torch.full((seq_len, seq_len), fill_value=d_min, dtype=dtype, device=device)

    block_end_idx = torch.tensor(patch_embeds_list).cumsum(-1)
    block_start_idx = torch.tensor([0] + patch_embeds_list[:-1]).cumsum(-1)
    for start, end in zip(block_start_idx, block_end_idx):
        causal_mask[start:end, start:end] = 0

    causal_mask = causal_mask[None, None, :, :].expand(tensor.shape[0], 1, -1, -1)
    return causal_mask

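
# --- Illustrative sketch (not part of the original Pixtral source): for two images contributing
# 4 and 9 patches, generate_block_attention_mask builds a block-diagonal additive bias that is 0
# inside each image's block and a large negative value elsewhere, so patches never attend across
# image boundaries once the bias is added to the attention scores.
def _example_block_attention_mask():
    patch_counts = [4, 9]
    dummy = torch.zeros(1, sum(patch_counts), 8)  # [batch, seq_len, hidden_size]
    mask = generate_block_attention_mask(patch_counts, dummy)
    assert mask.shape == (1, 1, 13, 13)
    assert mask[0, 0, 0, 3] == 0  # within the first image's block
    assert mask[0, 0, 0, 4] < 0  # across images: effectively masked out
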
@add_start_docstrings(
    "The bare Pixtral vision encoder outputting raw hidden-states without any specific head on top.",
    PIXTRAL_START_DOCSTRING,
)
class PixtralVisionModel(PixtralPreTrainedModel):
    base_model_prefix = "vision_encoder"

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.patch_conv = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=config.hidden_size,
            kernel_size=config.patch_size,
            stride=config.patch_size,
            bias=False,
        )
        self.patch_size = config.patch_size
        self.ln_pre = PixtralRMSNorm(config.hidden_size, eps=1e-5)
        self.transformer = PixtralTransformer(config)
        self.patch_positional_embedding = PixtralRotaryEmbedding(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.patch_conv

    @add_start_docstrings_to_model_forward(PIXTRAL_INPUTS_DOCSTRING)
    def forward(
        self,
        pixel_values: torch.Tensor,
        image_sizes: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        *args,
        **kwargs,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Returns:
            `BaseModelOutput` or `tuple`: the encoder output, whose `last_hidden_state` contains the features of
            all patch tokens of all images, of shape `(1, N_toks, hidden_size)`.
        """
        # pass images through initial convolution independently
        patch_embeds = self.patch_conv(pixel_values)
        patch_embeds_list = [
            embed[..., : (size[0] // self.patch_size), : (size[1] // self.patch_size)]
            for embed, size in zip(patch_embeds, image_sizes)
        ]

        # flatten to a single sequence
        patch_embeds = torch.cat([p.flatten(1).T for p in patch_embeds_list], dim=0).unsqueeze(0)
        patch_embeds = self.ln_pre(patch_embeds)

        # positional embeddings
        position_ids = position_ids_in_meshgrid(
            patch_embeds_list, max_width=self.config.image_size // self.config.patch_size
        )
        position_embeddings = self.patch_positional_embedding(patch_embeds, position_ids)

        attention_mask = generate_block_attention_mask(
            [p.shape[-2] * p.shape[-1] for p in patch_embeds_list], patch_embeds
        )

        out = self.transformer(
            patch_embeds,
            attention_mask=attention_mask,
            position_embeddings=position_embeddings,
            output_hidden_states=output_hidden_states,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        return out


__all__ = ["PixtralVisionModel", "PixtralPreTrainedModel"]
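

# --- Illustrative usage sketch (not part of the original Pixtral source): running the bare
# vision encoder on a randomly initialized, deliberately tiny PixtralVisionConfig. All config
# values below are assumptions for demonstration; real checkpoints are loaded with
# `PixtralVisionModel.from_pretrained(...)` together with the matching image processor.
def _example_pixtral_vision_model():
    config = PixtralVisionConfig(
        hidden_size=64,
        intermediate_size=128,
        num_hidden_layers=2,
        num_attention_heads=4,
        image_size=64,
        patch_size=16,
    )
    model = PixtralVisionModel(config).eval()
    pixel_values = torch.randn(1, 3, 64, 48)  # one 64x48 RGB image
    image_sizes = torch.tensor([[64, 48]])  # (height, width) per image
    with torch.no_grad():
        out = model(pixel_values, image_sizes=image_sizes, return_dict=True)
    # a 64x48 image with 16-pixel patches yields 4 x 3 = 12 patch tokens
    assert out.last_hidden_state.shape == (1, 12, config.hidden_size)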