TrumpMcDonaldz committed on
Commit
43675ed
0 Parent(s):

Duplicate from TrumpMcDonaldz/Florence-2-large-onnx

.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,468 @@
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based Rest Client
.idea/httpRequests

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# Added by TrumpMcDonaldz:

# Ignore DS_Store
.DS_Store

# Ignore appsettings.json
appsettings.json

# Ignore .idea
.idea/
README.md ADDED
@@ -0,0 +1,80 @@
---
license: mit
pipeline_tag: image-text-to-text
tags:
- vision
- text-generation
- text2text-generation
- image-to-text
library_name: transformers.js
---

https://huggingface.co/microsoft/Florence-2-large with ONNX weights to be compatible with Transformers.js.

## Usage (Transformers.js)

> [!IMPORTANT]
> NOTE: Florence-2 support is experimental and requires you to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source.

If you haven't already, you can install the [Transformers.js](https://huggingface.co/docs/transformers.js) JavaScript library from [GitHub](https://github.com/xenova/transformers.js/tree/v3) using:
```bash
npm install xenova/transformers.js#v3
```

**Example:** Perform image captioning with `onnx-community/Florence-2-large`.
```js
import {
  Florence2ForConditionalGeneration,
  AutoProcessor,
  AutoTokenizer,
  RawImage,
} from '@xenova/transformers';

// Load model, processor, and tokenizer
const model_id = 'onnx-community/Florence-2-large';
const model = await Florence2ForConditionalGeneration.from_pretrained(model_id, {
  dtype: {
    embed_tokens: 'fp16', // or 'fp32'
    vision_encoder: 'fp16', // or 'fp32'
    encoder_model: 'q4',
    decoder_model_merged: 'q4',
  },
});
const processor = await AutoProcessor.from_pretrained(model_id);
const tokenizer = await AutoTokenizer.from_pretrained(model_id);

// Load image and prepare vision inputs
const url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg';
const image = await RawImage.fromURL(url);
const vision_inputs = await processor(image);

// Specify task and prepare text inputs
const task = '<MORE_DETAILED_CAPTION>';
const prompts = processor.construct_prompts(task);
const text_inputs = tokenizer(prompts);

// Generate text
const generated_ids = await model.generate({
  ...text_inputs,
  ...vision_inputs,
  max_new_tokens: 256,
});

// Decode generated text
const generated_text = tokenizer.batch_decode(generated_ids, { skip_special_tokens: false })[0];

// Post-process the generated text
const result = processor.post_process_generation(generated_text, task, image.size);
console.log(result);
// { '<MORE_DETAILED_CAPTION>': 'The image shows a vintage Volkswagen Beetle car parked on a cobblestone street in front of a yellow building with two wooden doors. The car is a bright turquoise color and has a classic design with a round body and a sloping roofline. It has two doors on either side of the car, one on the left side and one in the center, with a brown door on the right side. The doors are made of wood and have a rustic, weathered look. The building behind the car is painted in a light yellow color and appears to be old and dilapidated. The sky is blue and there are trees in the background. The image is taken from a low angle, looking up at the car and the building.' }
```
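
Florence-2 is multi-task: swapping the task token changes both the constructed prompt and how the raw output is post-processed (the mapping lives in `tasks_answer_post_processing_type` in this repo's `preprocessor_config.json`). As a minimal sketch, assuming the `model`, `processor`, `tokenizer`, `image`, and `vision_inputs` from the example above are still in scope, object detection with the `<OD>` task could look like this:

```js
// Hypothetical variation of the captioning example: object detection.
// Reuses `model`, `processor`, `tokenizer`, `image`, and `vision_inputs` from above.
const od_task = '<OD>';
const od_text_inputs = tokenizer(processor.construct_prompts(od_task));

const od_generated_ids = await model.generate({
  ...od_text_inputs,
  ...vision_inputs,
  max_new_tokens: 128,
});

const od_generated_text = tokenizer.batch_decode(od_generated_ids, { skip_special_tokens: false })[0];
const od_result = processor.post_process_generation(od_generated_text, od_task, image.size);
console.log(od_result);
// Expected to resemble: { '<OD>': { bboxes: [[x1, y1, x2, y2], ...], labels: ['car', ...] } }
```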

We also released an online demo, which you can try yourself: https://huggingface.co/spaces/Xenova/florence2-webgpu


<video controls autoplay src="https://cdn-uploads.huggingface.co/production/uploads/61b253b7ac5ecaae3d1efe0c/BJj3jQXNqS_7Nt2MSb2ss.mp4"></video>

---

Note: Having a separate repo for ONNX weights is intended to be a temporary solution until WebML gains more traction. If you would like to make your models web-ready, we recommend converting to ONNX using [🤗 Optimum](https://huggingface.co/docs/optimum/index) and structuring your repo like this one (with ONNX weights located in a subfolder named `onnx`).
added_tokens.json ADDED
@@ -0,0 +1,1026 @@
{
  "</cap>": 51270,
  "</dcap>": 51274,
  "</grounding>": 51276,
  "</ncap>": 51272,
  "</ocr>": 50268,
  "</od>": 50266,
  "</poly>": 51287,
  "</proposal>": 51285,
  "</region_cap>": 51281,
  "</region_to_desciption>": 51283,
  "</seg>": 51278,
  "<and>": 51288,
  "<cap>": 51269,
  "<dcap>": 51273,
  "<grounding>": 51275,
  "<loc_0>": 50269,
  "<loc_100>": 50369,
  "<loc_101>": 50370,
  "<loc_102>": 50371,
  "<loc_103>": 50372,
  "<loc_104>": 50373,
  "<loc_105>": 50374,
  "<loc_106>": 50375,
  "<loc_107>": 50376,
  "<loc_108>": 50377,
  "<loc_109>": 50378,
  "<loc_10>": 50279,
  "<loc_110>": 50379,
  "<loc_111>": 50380,
  "<loc_112>": 50381,
  "<loc_113>": 50382,
  "<loc_114>": 50383,
  "<loc_115>": 50384,
  "<loc_116>": 50385,
  "<loc_117>": 50386,
  "<loc_118>": 50387,
  "<loc_119>": 50388,
  "<loc_11>": 50280,
  "<loc_120>": 50389,
  "<loc_121>": 50390,
  "<loc_122>": 50391,
  "<loc_123>": 50392,
  "<loc_124>": 50393,
  "<loc_125>": 50394,
  "<loc_126>": 50395,
  "<loc_127>": 50396,
  "<loc_128>": 50397,
  "<loc_129>": 50398,
  "<loc_12>": 50281,
  "<loc_130>": 50399,
  "<loc_131>": 50400,
  "<loc_132>": 50401,
  "<loc_133>": 50402,
  "<loc_134>": 50403,
  "<loc_135>": 50404,
  "<loc_136>": 50405,
  "<loc_137>": 50406,
  "<loc_138>": 50407,
  "<loc_139>": 50408,
  "<loc_13>": 50282,
  "<loc_140>": 50409,
  "<loc_141>": 50410,
  "<loc_142>": 50411,
  "<loc_143>": 50412,
  "<loc_144>": 50413,
  "<loc_145>": 50414,
  "<loc_146>": 50415,
  "<loc_147>": 50416,
  "<loc_148>": 50417,
  "<loc_149>": 50418,
  "<loc_14>": 50283,
  "<loc_150>": 50419,
  "<loc_151>": 50420,
  "<loc_152>": 50421,
  "<loc_153>": 50422,
  "<loc_154>": 50423,
  "<loc_155>": 50424,
  "<loc_156>": 50425,
  "<loc_157>": 50426,
  "<loc_158>": 50427,
  "<loc_159>": 50428,
  "<loc_15>": 50284,
  "<loc_160>": 50429,
  "<loc_161>": 50430,
  "<loc_162>": 50431,
  "<loc_163>": 50432,
  "<loc_164>": 50433,
  "<loc_165>": 50434,
  "<loc_166>": 50435,
  "<loc_167>": 50436,
  "<loc_168>": 50437,
  "<loc_169>": 50438,
  "<loc_16>": 50285,
  "<loc_170>": 50439,
  "<loc_171>": 50440,
  "<loc_172>": 50441,
  "<loc_173>": 50442,
  "<loc_174>": 50443,
  "<loc_175>": 50444,
  "<loc_176>": 50445,
  "<loc_177>": 50446,
  "<loc_178>": 50447,
  "<loc_179>": 50448,
  "<loc_17>": 50286,
  "<loc_180>": 50449,
  "<loc_181>": 50450,
  "<loc_182>": 50451,
  "<loc_183>": 50452,
  "<loc_184>": 50453,
  "<loc_185>": 50454,
  "<loc_186>": 50455,
  "<loc_187>": 50456,
  "<loc_188>": 50457,
  "<loc_189>": 50458,
  "<loc_18>": 50287,
  "<loc_190>": 50459,
  "<loc_191>": 50460,
  "<loc_192>": 50461,
  "<loc_193>": 50462,
  "<loc_194>": 50463,
  "<loc_195>": 50464,
  "<loc_196>": 50465,
  "<loc_197>": 50466,
  "<loc_198>": 50467,
  "<loc_199>": 50468,
  "<loc_19>": 50288,
  "<loc_1>": 50270,
  "<loc_200>": 50469,
  "<loc_201>": 50470,
  "<loc_202>": 50471,
  "<loc_203>": 50472,
  "<loc_204>": 50473,
  "<loc_205>": 50474,
  "<loc_206>": 50475,
  "<loc_207>": 50476,
  "<loc_208>": 50477,
  "<loc_209>": 50478,
  "<loc_20>": 50289,
  "<loc_210>": 50479,
  "<loc_211>": 50480,
  "<loc_212>": 50481,
  "<loc_213>": 50482,
  "<loc_214>": 50483,
  "<loc_215>": 50484,
  "<loc_216>": 50485,
  "<loc_217>": 50486,
  "<loc_218>": 50487,
  "<loc_219>": 50488,
  "<loc_21>": 50290,
  "<loc_220>": 50489,
  "<loc_221>": 50490,
  "<loc_222>": 50491,
  "<loc_223>": 50492,
  "<loc_224>": 50493,
  "<loc_225>": 50494,
  "<loc_226>": 50495,
  "<loc_227>": 50496,
  "<loc_228>": 50497,
  "<loc_229>": 50498,
  "<loc_22>": 50291,
  "<loc_230>": 50499,
  "<loc_231>": 50500,
  "<loc_232>": 50501,
  "<loc_233>": 50502,
  "<loc_234>": 50503,
  "<loc_235>": 50504,
  "<loc_236>": 50505,
  "<loc_237>": 50506,
  "<loc_238>": 50507,
  "<loc_239>": 50508,
  "<loc_23>": 50292,
  "<loc_240>": 50509,
  "<loc_241>": 50510,
  "<loc_242>": 50511,
  "<loc_243>": 50512,
  "<loc_244>": 50513,
  "<loc_245>": 50514,
  "<loc_246>": 50515,
  "<loc_247>": 50516,
  "<loc_248>": 50517,
  "<loc_249>": 50518,
  "<loc_24>": 50293,
  "<loc_250>": 50519,
  "<loc_251>": 50520,
  "<loc_252>": 50521,
  "<loc_253>": 50522,
  "<loc_254>": 50523,
  "<loc_255>": 50524,
  "<loc_256>": 50525,
  "<loc_257>": 50526,
  "<loc_258>": 50527,
  "<loc_259>": 50528,
  "<loc_25>": 50294,
  "<loc_260>": 50529,
  "<loc_261>": 50530,
  "<loc_262>": 50531,
  "<loc_263>": 50532,
  "<loc_264>": 50533,
  "<loc_265>": 50534,
  "<loc_266>": 50535,
  "<loc_267>": 50536,
  "<loc_268>": 50537,
  "<loc_269>": 50538,
  "<loc_26>": 50295,
  "<loc_270>": 50539,
  "<loc_271>": 50540,
  "<loc_272>": 50541,
  "<loc_273>": 50542,
  "<loc_274>": 50543,
  "<loc_275>": 50544,
  "<loc_276>": 50545,
  "<loc_277>": 50546,
  "<loc_278>": 50547,
  "<loc_279>": 50548,
  "<loc_27>": 50296,
  "<loc_280>": 50549,
  "<loc_281>": 50550,
  "<loc_282>": 50551,
  "<loc_283>": 50552,
  "<loc_284>": 50553,
  "<loc_285>": 50554,
  "<loc_286>": 50555,
  "<loc_287>": 50556,
  "<loc_288>": 50557,
  "<loc_289>": 50558,
  "<loc_28>": 50297,
  "<loc_290>": 50559,
  "<loc_291>": 50560,
  "<loc_292>": 50561,
  "<loc_293>": 50562,
  "<loc_294>": 50563,
  "<loc_295>": 50564,
  "<loc_296>": 50565,
  "<loc_297>": 50566,
  "<loc_298>": 50567,
  "<loc_299>": 50568,
  "<loc_29>": 50298,
  "<loc_2>": 50271,
  "<loc_300>": 50569,
  "<loc_301>": 50570,
  "<loc_302>": 50571,
  "<loc_303>": 50572,
  "<loc_304>": 50573,
  "<loc_305>": 50574,
  "<loc_306>": 50575,
  "<loc_307>": 50576,
  "<loc_308>": 50577,
  "<loc_309>": 50578,
  "<loc_30>": 50299,
  "<loc_310>": 50579,
  "<loc_311>": 50580,
  "<loc_312>": 50581,
  "<loc_313>": 50582,
  "<loc_314>": 50583,
  "<loc_315>": 50584,
  "<loc_316>": 50585,
  "<loc_317>": 50586,
  "<loc_318>": 50587,
  "<loc_319>": 50588,
  "<loc_31>": 50300,
  "<loc_320>": 50589,
  "<loc_321>": 50590,
  "<loc_322>": 50591,
  "<loc_323>": 50592,
  "<loc_324>": 50593,
  "<loc_325>": 50594,
  "<loc_326>": 50595,
  "<loc_327>": 50596,
  "<loc_328>": 50597,
  "<loc_329>": 50598,
  "<loc_32>": 50301,
  "<loc_330>": 50599,
  "<loc_331>": 50600,
  "<loc_332>": 50601,
  "<loc_333>": 50602,
  "<loc_334>": 50603,
  "<loc_335>": 50604,
  "<loc_336>": 50605,
  "<loc_337>": 50606,
  "<loc_338>": 50607,
  "<loc_339>": 50608,
  "<loc_33>": 50302,
  "<loc_340>": 50609,
  "<loc_341>": 50610,
  "<loc_342>": 50611,
  "<loc_343>": 50612,
  "<loc_344>": 50613,
  "<loc_345>": 50614,
  "<loc_346>": 50615,
  "<loc_347>": 50616,
  "<loc_348>": 50617,
  "<loc_349>": 50618,
  "<loc_34>": 50303,
  "<loc_350>": 50619,
  "<loc_351>": 50620,
  "<loc_352>": 50621,
  "<loc_353>": 50622,
  "<loc_354>": 50623,
  "<loc_355>": 50624,
  "<loc_356>": 50625,
  "<loc_357>": 50626,
  "<loc_358>": 50627,
  "<loc_359>": 50628,
  "<loc_35>": 50304,
  "<loc_360>": 50629,
  "<loc_361>": 50630,
  "<loc_362>": 50631,
  "<loc_363>": 50632,
  "<loc_364>": 50633,
  "<loc_365>": 50634,
  "<loc_366>": 50635,
  "<loc_367>": 50636,
  "<loc_368>": 50637,
  "<loc_369>": 50638,
  "<loc_36>": 50305,
  "<loc_370>": 50639,
  "<loc_371>": 50640,
  "<loc_372>": 50641,
  "<loc_373>": 50642,
  "<loc_374>": 50643,
  "<loc_375>": 50644,
  "<loc_376>": 50645,
  "<loc_377>": 50646,
  "<loc_378>": 50647,
  "<loc_379>": 50648,
  "<loc_37>": 50306,
  "<loc_380>": 50649,
  "<loc_381>": 50650,
  "<loc_382>": 50651,
  "<loc_383>": 50652,
  "<loc_384>": 50653,
  "<loc_385>": 50654,
  "<loc_386>": 50655,
  "<loc_387>": 50656,
  "<loc_388>": 50657,
  "<loc_389>": 50658,
  "<loc_38>": 50307,
  "<loc_390>": 50659,
  "<loc_391>": 50660,
  "<loc_392>": 50661,
  "<loc_393>": 50662,
  "<loc_394>": 50663,
  "<loc_395>": 50664,
  "<loc_396>": 50665,
  "<loc_397>": 50666,
  "<loc_398>": 50667,
  "<loc_399>": 50668,
  "<loc_39>": 50308,
  "<loc_3>": 50272,
  "<loc_400>": 50669,
  "<loc_401>": 50670,
  "<loc_402>": 50671,
  "<loc_403>": 50672,
  "<loc_404>": 50673,
  "<loc_405>": 50674,
  "<loc_406>": 50675,
  "<loc_407>": 50676,
  "<loc_408>": 50677,
  "<loc_409>": 50678,
  "<loc_40>": 50309,
  "<loc_410>": 50679,
  "<loc_411>": 50680,
  "<loc_412>": 50681,
  "<loc_413>": 50682,
  "<loc_414>": 50683,
  "<loc_415>": 50684,
  "<loc_416>": 50685,
  "<loc_417>": 50686,
  "<loc_418>": 50687,
  "<loc_419>": 50688,
  "<loc_41>": 50310,
  "<loc_420>": 50689,
  "<loc_421>": 50690,
  "<loc_422>": 50691,
  "<loc_423>": 50692,
  "<loc_424>": 50693,
  "<loc_425>": 50694,
  "<loc_426>": 50695,
  "<loc_427>": 50696,
  "<loc_428>": 50697,
  "<loc_429>": 50698,
  "<loc_42>": 50311,
  "<loc_430>": 50699,
  "<loc_431>": 50700,
  "<loc_432>": 50701,
  "<loc_433>": 50702,
  "<loc_434>": 50703,
  "<loc_435>": 50704,
  "<loc_436>": 50705,
  "<loc_437>": 50706,
  "<loc_438>": 50707,
  "<loc_439>": 50708,
  "<loc_43>": 50312,
  "<loc_440>": 50709,
  "<loc_441>": 50710,
  "<loc_442>": 50711,
  "<loc_443>": 50712,
  "<loc_444>": 50713,
  "<loc_445>": 50714,
  "<loc_446>": 50715,
  "<loc_447>": 50716,
  "<loc_448>": 50717,
  "<loc_449>": 50718,
  "<loc_44>": 50313,
  "<loc_450>": 50719,
  "<loc_451>": 50720,
  "<loc_452>": 50721,
  "<loc_453>": 50722,
  "<loc_454>": 50723,
  "<loc_455>": 50724,
  "<loc_456>": 50725,
  "<loc_457>": 50726,
  "<loc_458>": 50727,
  "<loc_459>": 50728,
  "<loc_45>": 50314,
  "<loc_460>": 50729,
  "<loc_461>": 50730,
  "<loc_462>": 50731,
  "<loc_463>": 50732,
  "<loc_464>": 50733,
  "<loc_465>": 50734,
  "<loc_466>": 50735,
  "<loc_467>": 50736,
  "<loc_468>": 50737,
  "<loc_469>": 50738,
  "<loc_46>": 50315,
  "<loc_470>": 50739,
  "<loc_471>": 50740,
  "<loc_472>": 50741,
  "<loc_473>": 50742,
  "<loc_474>": 50743,
  "<loc_475>": 50744,
  "<loc_476>": 50745,
  "<loc_477>": 50746,
  "<loc_478>": 50747,
  "<loc_479>": 50748,
  "<loc_47>": 50316,
  "<loc_480>": 50749,
  "<loc_481>": 50750,
  "<loc_482>": 50751,
  "<loc_483>": 50752,
  "<loc_484>": 50753,
  "<loc_485>": 50754,
  "<loc_486>": 50755,
  "<loc_487>": 50756,
  "<loc_488>": 50757,
  "<loc_489>": 50758,
  "<loc_48>": 50317,
  "<loc_490>": 50759,
  "<loc_491>": 50760,
  "<loc_492>": 50761,
  "<loc_493>": 50762,
  "<loc_494>": 50763,
  "<loc_495>": 50764,
  "<loc_496>": 50765,
  "<loc_497>": 50766,
  "<loc_498>": 50767,
  "<loc_499>": 50768,
  "<loc_49>": 50318,
  "<loc_4>": 50273,
  "<loc_500>": 50769,
  "<loc_501>": 50770,
  "<loc_502>": 50771,
  "<loc_503>": 50772,
  "<loc_504>": 50773,
  "<loc_505>": 50774,
  "<loc_506>": 50775,
  "<loc_507>": 50776,
  "<loc_508>": 50777,
  "<loc_509>": 50778,
  "<loc_50>": 50319,
  "<loc_510>": 50779,
  "<loc_511>": 50780,
  "<loc_512>": 50781,
  "<loc_513>": 50782,
  "<loc_514>": 50783,
  "<loc_515>": 50784,
  "<loc_516>": 50785,
  "<loc_517>": 50786,
  "<loc_518>": 50787,
  "<loc_519>": 50788,
  "<loc_51>": 50320,
  "<loc_520>": 50789,
  "<loc_521>": 50790,
  "<loc_522>": 50791,
  "<loc_523>": 50792,
  "<loc_524>": 50793,
  "<loc_525>": 50794,
  "<loc_526>": 50795,
  "<loc_527>": 50796,
  "<loc_528>": 50797,
  "<loc_529>": 50798,
  "<loc_52>": 50321,
  "<loc_530>": 50799,
  "<loc_531>": 50800,
  "<loc_532>": 50801,
  "<loc_533>": 50802,
  "<loc_534>": 50803,
  "<loc_535>": 50804,
  "<loc_536>": 50805,
  "<loc_537>": 50806,
  "<loc_538>": 50807,
  "<loc_539>": 50808,
  "<loc_53>": 50322,
  "<loc_540>": 50809,
  "<loc_541>": 50810,
  "<loc_542>": 50811,
  "<loc_543>": 50812,
  "<loc_544>": 50813,
  "<loc_545>": 50814,
  "<loc_546>": 50815,
  "<loc_547>": 50816,
  "<loc_548>": 50817,
  "<loc_549>": 50818,
  "<loc_54>": 50323,
  "<loc_550>": 50819,
  "<loc_551>": 50820,
  "<loc_552>": 50821,
  "<loc_553>": 50822,
  "<loc_554>": 50823,
  "<loc_555>": 50824,
  "<loc_556>": 50825,
  "<loc_557>": 50826,
  "<loc_558>": 50827,
  "<loc_559>": 50828,
  "<loc_55>": 50324,
  "<loc_560>": 50829,
  "<loc_561>": 50830,
  "<loc_562>": 50831,
  "<loc_563>": 50832,
  "<loc_564>": 50833,
  "<loc_565>": 50834,
  "<loc_566>": 50835,
  "<loc_567>": 50836,
  "<loc_568>": 50837,
  "<loc_569>": 50838,
  "<loc_56>": 50325,
  "<loc_570>": 50839,
  "<loc_571>": 50840,
  "<loc_572>": 50841,
  "<loc_573>": 50842,
  "<loc_574>": 50843,
  "<loc_575>": 50844,
  "<loc_576>": 50845,
  "<loc_577>": 50846,
  "<loc_578>": 50847,
  "<loc_579>": 50848,
  "<loc_57>": 50326,
  "<loc_580>": 50849,
  "<loc_581>": 50850,
  "<loc_582>": 50851,
  "<loc_583>": 50852,
  "<loc_584>": 50853,
  "<loc_585>": 50854,
  "<loc_586>": 50855,
  "<loc_587>": 50856,
  "<loc_588>": 50857,
  "<loc_589>": 50858,
  "<loc_58>": 50327,
  "<loc_590>": 50859,
  "<loc_591>": 50860,
  "<loc_592>": 50861,
  "<loc_593>": 50862,
  "<loc_594>": 50863,
  "<loc_595>": 50864,
  "<loc_596>": 50865,
  "<loc_597>": 50866,
  "<loc_598>": 50867,
  "<loc_599>": 50868,
  "<loc_59>": 50328,
  "<loc_5>": 50274,
  "<loc_600>": 50869,
  "<loc_601>": 50870,
  "<loc_602>": 50871,
  "<loc_603>": 50872,
  "<loc_604>": 50873,
  "<loc_605>": 50874,
  "<loc_606>": 50875,
  "<loc_607>": 50876,
  "<loc_608>": 50877,
  "<loc_609>": 50878,
  "<loc_60>": 50329,
  "<loc_610>": 50879,
  "<loc_611>": 50880,
  "<loc_612>": 50881,
  "<loc_613>": 50882,
  "<loc_614>": 50883,
  "<loc_615>": 50884,
  "<loc_616>": 50885,
  "<loc_617>": 50886,
  "<loc_618>": 50887,
  "<loc_619>": 50888,
  "<loc_61>": 50330,
  "<loc_620>": 50889,
  "<loc_621>": 50890,
  "<loc_622>": 50891,
  "<loc_623>": 50892,
  "<loc_624>": 50893,
  "<loc_625>": 50894,
  "<loc_626>": 50895,
  "<loc_627>": 50896,
  "<loc_628>": 50897,
  "<loc_629>": 50898,
  "<loc_62>": 50331,
  "<loc_630>": 50899,
  "<loc_631>": 50900,
  "<loc_632>": 50901,
  "<loc_633>": 50902,
  "<loc_634>": 50903,
  "<loc_635>": 50904,
  "<loc_636>": 50905,
  "<loc_637>": 50906,
  "<loc_638>": 50907,
  "<loc_639>": 50908,
  "<loc_63>": 50332,
  "<loc_640>": 50909,
  "<loc_641>": 50910,
  "<loc_642>": 50911,
  "<loc_643>": 50912,
  "<loc_644>": 50913,
  "<loc_645>": 50914,
  "<loc_646>": 50915,
  "<loc_647>": 50916,
  "<loc_648>": 50917,
  "<loc_649>": 50918,
  "<loc_64>": 50333,
  "<loc_650>": 50919,
  "<loc_651>": 50920,
  "<loc_652>": 50921,
  "<loc_653>": 50922,
  "<loc_654>": 50923,
  "<loc_655>": 50924,
  "<loc_656>": 50925,
  "<loc_657>": 50926,
  "<loc_658>": 50927,
  "<loc_659>": 50928,
  "<loc_65>": 50334,
  "<loc_660>": 50929,
  "<loc_661>": 50930,
  "<loc_662>": 50931,
  "<loc_663>": 50932,
  "<loc_664>": 50933,
  "<loc_665>": 50934,
  "<loc_666>": 50935,
  "<loc_667>": 50936,
  "<loc_668>": 50937,
  "<loc_669>": 50938,
  "<loc_66>": 50335,
  "<loc_670>": 50939,
  "<loc_671>": 50940,
  "<loc_672>": 50941,
  "<loc_673>": 50942,
  "<loc_674>": 50943,
  "<loc_675>": 50944,
  "<loc_676>": 50945,
  "<loc_677>": 50946,
  "<loc_678>": 50947,
  "<loc_679>": 50948,
  "<loc_67>": 50336,
  "<loc_680>": 50949,
  "<loc_681>": 50950,
  "<loc_682>": 50951,
  "<loc_683>": 50952,
  "<loc_684>": 50953,
  "<loc_685>": 50954,
  "<loc_686>": 50955,
  "<loc_687>": 50956,
  "<loc_688>": 50957,
  "<loc_689>": 50958,
  "<loc_68>": 50337,
  "<loc_690>": 50959,
  "<loc_691>": 50960,
  "<loc_692>": 50961,
  "<loc_693>": 50962,
  "<loc_694>": 50963,
  "<loc_695>": 50964,
  "<loc_696>": 50965,
  "<loc_697>": 50966,
  "<loc_698>": 50967,
  "<loc_699>": 50968,
  "<loc_69>": 50338,
  "<loc_6>": 50275,
  "<loc_700>": 50969,
  "<loc_701>": 50970,
  "<loc_702>": 50971,
  "<loc_703>": 50972,
  "<loc_704>": 50973,
  "<loc_705>": 50974,
  "<loc_706>": 50975,
  "<loc_707>": 50976,
  "<loc_708>": 50977,
  "<loc_709>": 50978,
  "<loc_70>": 50339,
  "<loc_710>": 50979,
  "<loc_711>": 50980,
  "<loc_712>": 50981,
  "<loc_713>": 50982,
  "<loc_714>": 50983,
  "<loc_715>": 50984,
  "<loc_716>": 50985,
  "<loc_717>": 50986,
  "<loc_718>": 50987,
  "<loc_719>": 50988,
  "<loc_71>": 50340,
  "<loc_720>": 50989,
  "<loc_721>": 50990,
  "<loc_722>": 50991,
  "<loc_723>": 50992,
  "<loc_724>": 50993,
  "<loc_725>": 50994,
  "<loc_726>": 50995,
  "<loc_727>": 50996,
  "<loc_728>": 50997,
  "<loc_729>": 50998,
  "<loc_72>": 50341,
  "<loc_730>": 50999,
  "<loc_731>": 51000,
  "<loc_732>": 51001,
  "<loc_733>": 51002,
  "<loc_734>": 51003,
  "<loc_735>": 51004,
  "<loc_736>": 51005,
  "<loc_737>": 51006,
  "<loc_738>": 51007,
  "<loc_739>": 51008,
  "<loc_73>": 50342,
  "<loc_740>": 51009,
  "<loc_741>": 51010,
  "<loc_742>": 51011,
  "<loc_743>": 51012,
  "<loc_744>": 51013,
  "<loc_745>": 51014,
  "<loc_746>": 51015,
  "<loc_747>": 51016,
  "<loc_748>": 51017,
  "<loc_749>": 51018,
  "<loc_74>": 50343,
  "<loc_750>": 51019,
  "<loc_751>": 51020,
  "<loc_752>": 51021,
  "<loc_753>": 51022,
  "<loc_754>": 51023,
  "<loc_755>": 51024,
  "<loc_756>": 51025,
  "<loc_757>": 51026,
  "<loc_758>": 51027,
  "<loc_759>": 51028,
  "<loc_75>": 50344,
  "<loc_760>": 51029,
  "<loc_761>": 51030,
  "<loc_762>": 51031,
  "<loc_763>": 51032,
  "<loc_764>": 51033,
  "<loc_765>": 51034,
  "<loc_766>": 51035,
  "<loc_767>": 51036,
  "<loc_768>": 51037,
  "<loc_769>": 51038,
  "<loc_76>": 50345,
  "<loc_770>": 51039,
  "<loc_771>": 51040,
  "<loc_772>": 51041,
  "<loc_773>": 51042,
  "<loc_774>": 51043,
  "<loc_775>": 51044,
  "<loc_776>": 51045,
  "<loc_777>": 51046,
  "<loc_778>": 51047,
  "<loc_779>": 51048,
  "<loc_77>": 50346,
  "<loc_780>": 51049,
  "<loc_781>": 51050,
  "<loc_782>": 51051,
  "<loc_783>": 51052,
  "<loc_784>": 51053,
  "<loc_785>": 51054,
  "<loc_786>": 51055,
  "<loc_787>": 51056,
  "<loc_788>": 51057,
  "<loc_789>": 51058,
  "<loc_78>": 50347,
  "<loc_790>": 51059,
  "<loc_791>": 51060,
  "<loc_792>": 51061,
  "<loc_793>": 51062,
  "<loc_794>": 51063,
  "<loc_795>": 51064,
  "<loc_796>": 51065,
  "<loc_797>": 51066,
  "<loc_798>": 51067,
  "<loc_799>": 51068,
  "<loc_79>": 50348,
  "<loc_7>": 50276,
  "<loc_800>": 51069,
  "<loc_801>": 51070,
  "<loc_802>": 51071,
  "<loc_803>": 51072,
  "<loc_804>": 51073,
  "<loc_805>": 51074,
  "<loc_806>": 51075,
  "<loc_807>": 51076,
  "<loc_808>": 51077,
  "<loc_809>": 51078,
  "<loc_80>": 50349,
  "<loc_810>": 51079,
  "<loc_811>": 51080,
  "<loc_812>": 51081,
  "<loc_813>": 51082,
  "<loc_814>": 51083,
  "<loc_815>": 51084,
  "<loc_816>": 51085,
  "<loc_817>": 51086,
  "<loc_818>": 51087,
  "<loc_819>": 51088,
  "<loc_81>": 50350,
  "<loc_820>": 51089,
  "<loc_821>": 51090,
  "<loc_822>": 51091,
  "<loc_823>": 51092,
  "<loc_824>": 51093,
  "<loc_825>": 51094,
  "<loc_826>": 51095,
  "<loc_827>": 51096,
  "<loc_828>": 51097,
  "<loc_829>": 51098,
  "<loc_82>": 50351,
  "<loc_830>": 51099,
  "<loc_831>": 51100,
  "<loc_832>": 51101,
  "<loc_833>": 51102,
  "<loc_834>": 51103,
  "<loc_835>": 51104,
  "<loc_836>": 51105,
  "<loc_837>": 51106,
  "<loc_838>": 51107,
  "<loc_839>": 51108,
  "<loc_83>": 50352,
  "<loc_840>": 51109,
  "<loc_841>": 51110,
  "<loc_842>": 51111,
  "<loc_843>": 51112,
  "<loc_844>": 51113,
  "<loc_845>": 51114,
  "<loc_846>": 51115,
  "<loc_847>": 51116,
  "<loc_848>": 51117,
  "<loc_849>": 51118,
  "<loc_84>": 50353,
  "<loc_850>": 51119,
  "<loc_851>": 51120,
  "<loc_852>": 51121,
  "<loc_853>": 51122,
  "<loc_854>": 51123,
  "<loc_855>": 51124,
  "<loc_856>": 51125,
  "<loc_857>": 51126,
  "<loc_858>": 51127,
  "<loc_859>": 51128,
  "<loc_85>": 50354,
  "<loc_860>": 51129,
  "<loc_861>": 51130,
  "<loc_862>": 51131,
  "<loc_863>": 51132,
  "<loc_864>": 51133,
  "<loc_865>": 51134,
  "<loc_866>": 51135,
  "<loc_867>": 51136,
  "<loc_868>": 51137,
  "<loc_869>": 51138,
  "<loc_86>": 50355,
  "<loc_870>": 51139,
  "<loc_871>": 51140,
  "<loc_872>": 51141,
  "<loc_873>": 51142,
  "<loc_874>": 51143,
  "<loc_875>": 51144,
  "<loc_876>": 51145,
  "<loc_877>": 51146,
  "<loc_878>": 51147,
  "<loc_879>": 51148,
  "<loc_87>": 50356,
  "<loc_880>": 51149,
  "<loc_881>": 51150,
  "<loc_882>": 51151,
  "<loc_883>": 51152,
  "<loc_884>": 51153,
  "<loc_885>": 51154,
  "<loc_886>": 51155,
  "<loc_887>": 51156,
  "<loc_888>": 51157,
  "<loc_889>": 51158,
  "<loc_88>": 50357,
  "<loc_890>": 51159,
  "<loc_891>": 51160,
  "<loc_892>": 51161,
  "<loc_893>": 51162,
  "<loc_894>": 51163,
  "<loc_895>": 51164,
  "<loc_896>": 51165,
  "<loc_897>": 51166,
  "<loc_898>": 51167,
  "<loc_899>": 51168,
  "<loc_89>": 50358,
  "<loc_8>": 50277,
  "<loc_900>": 51169,
  "<loc_901>": 51170,
  "<loc_902>": 51171,
  "<loc_903>": 51172,
  "<loc_904>": 51173,
  "<loc_905>": 51174,
  "<loc_906>": 51175,
  "<loc_907>": 51176,
  "<loc_908>": 51177,
  "<loc_909>": 51178,
  "<loc_90>": 50359,
  "<loc_910>": 51179,
  "<loc_911>": 51180,
  "<loc_912>": 51181,
  "<loc_913>": 51182,
  "<loc_914>": 51183,
  "<loc_915>": 51184,
  "<loc_916>": 51185,
  "<loc_917>": 51186,
  "<loc_918>": 51187,
  "<loc_919>": 51188,
  "<loc_91>": 50360,
  "<loc_920>": 51189,
  "<loc_921>": 51190,
  "<loc_922>": 51191,
  "<loc_923>": 51192,
  "<loc_924>": 51193,
  "<loc_925>": 51194,
  "<loc_926>": 51195,
  "<loc_927>": 51196,
  "<loc_928>": 51197,
  "<loc_929>": 51198,
  "<loc_92>": 50361,
  "<loc_930>": 51199,
  "<loc_931>": 51200,
  "<loc_932>": 51201,
  "<loc_933>": 51202,
  "<loc_934>": 51203,
  "<loc_935>": 51204,
  "<loc_936>": 51205,
  "<loc_937>": 51206,
  "<loc_938>": 51207,
  "<loc_939>": 51208,
  "<loc_93>": 50362,
  "<loc_940>": 51209,
  "<loc_941>": 51210,
  "<loc_942>": 51211,
  "<loc_943>": 51212,
  "<loc_944>": 51213,
  "<loc_945>": 51214,
  "<loc_946>": 51215,
  "<loc_947>": 51216,
  "<loc_948>": 51217,
  "<loc_949>": 51218,
  "<loc_94>": 50363,
  "<loc_950>": 51219,
  "<loc_951>": 51220,
  "<loc_952>": 51221,
  "<loc_953>": 51222,
  "<loc_954>": 51223,
  "<loc_955>": 51224,
  "<loc_956>": 51225,
  "<loc_957>": 51226,
  "<loc_958>": 51227,
  "<loc_959>": 51228,
  "<loc_95>": 50364,
  "<loc_960>": 51229,
  "<loc_961>": 51230,
  "<loc_962>": 51231,
  "<loc_963>": 51232,
  "<loc_964>": 51233,
  "<loc_965>": 51234,
  "<loc_966>": 51235,
  "<loc_967>": 51236,
  "<loc_968>": 51237,
  "<loc_969>": 51238,
  "<loc_96>": 50365,
  "<loc_970>": 51239,
  "<loc_971>": 51240,
  "<loc_972>": 51241,
  "<loc_973>": 51242,
  "<loc_974>": 51243,
  "<loc_975>": 51244,
  "<loc_976>": 51245,
  "<loc_977>": 51246,
  "<loc_978>": 51247,
  "<loc_979>": 51248,
  "<loc_97>": 50366,
  "<loc_980>": 51249,
  "<loc_981>": 51250,
  "<loc_982>": 51251,
  "<loc_983>": 51252,
  "<loc_984>": 51253,
  "<loc_985>": 51254,
  "<loc_986>": 51255,
  "<loc_987>": 51256,
  "<loc_988>": 51257,
  "<loc_989>": 51258,
  "<loc_98>": 50367,
  "<loc_990>": 51259,
  "<loc_991>": 51260,
  "<loc_992>": 51261,
  "<loc_993>": 51262,
  "<loc_994>": 51263,
  "<loc_995>": 51264,
  "<loc_996>": 51265,
  "<loc_997>": 51266,
  "<loc_998>": 51267,
  "<loc_999>": 51268,
  "<loc_99>": 50368,
  "<loc_9>": 50278,
  "<ncap>": 51271,
  "<ocr>": 50267,
  "<od>": 50265,
  "<poly>": 51286,
  "<proposal>": 51284,
  "<region_cap>": 51280,
  "<region_to_desciption>": 51282,
  "<seg>": 51277,
  "<sep>": 51279
}
config.json ADDED
@@ -0,0 +1,233 @@
{
  "_name_or_path": "./Florence-2-large",
  "architectures": [
    "Florence2ForConditionalGeneration"
  ],
  "bos_token_id": 0,
  "eos_token_id": 2,
  "ignore_index": -100,
  "is_encoder_decoder": true,
  "model_type": "florence2",
  "pad_token_id": 1,
  "projection_dim": 1024,
  "text_config": {
    "_name_or_path": "",
    "activation_dropout": 0.1,
    "activation_function": "gelu",
    "add_bias_logits": false,
    "add_cross_attention": false,
    "add_final_layer_norm": false,
    "architectures": null,
    "attention_dropout": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "classif_dropout": 0.1,
    "classifier_dropout": 0.0,
    "cross_attention_hidden_size": null,
    "d_model": 1024,
    "decoder_attention_heads": 16,
    "decoder_ffn_dim": 4096,
    "decoder_layerdrop": 0.0,
    "decoder_layers": 12,
    "decoder_start_token_id": 2,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.1,
    "early_stopping": true,
    "encoder_attention_heads": 16,
    "encoder_ffn_dim": 4096,
    "encoder_layerdrop": 0.0,
    "encoder_layers": 12,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": 0,
    "forced_eos_token_id": 2,
    "gradient_checkpointing": false,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1",
      "2": "LABEL_2"
    },
    "init_std": 0.02,
    "is_decoder": false,
    "is_encoder_decoder": true,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1,
      "LABEL_2": 2
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 1024,
    "min_length": 0,
    "model_type": "florence2_language",
    "no_repeat_ngram_size": 3,
    "normalize_before": false,
    "num_beam_groups": 1,
    "num_beams": 3,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "scale_embedding": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "use_cache": true,
    "vocab_size": 51289
  },
  "transformers_version": "4.42.0.dev0",
  "use_cache": true,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "depths": [
      1,
      1,
      9,
      1
    ],
    "dim_embed": [
      256,
      512,
      1024,
      2048
    ],
    "diversity_penalty": 0.0,
    "do_sample": false,
    "drop_path_rate": 0.1,
    "early_stopping": false,
    "enable_checkpoint": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_feature_source": [
      "spatial_avg_pool",
      "temporal_avg_pool"
    ],
    "image_pos_embed": {
      "max_pos_embeddings": 50,
      "type": "learned_abs_2d"
    },
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "",
    "no_repeat_ngram_size": 0,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_groups": [
      8,
      16,
      32,
      64
    ],
    "num_heads": [
      8,
      16,
      32,
      64
    ],
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_padding": [
      3,
      1,
      1,
      1
    ],
    "patch_prenorm": [
      false,
      true,
      true,
      true
    ],
    "patch_size": [
      7,
      3,
      3,
      3
    ],
    "patch_stride": [
      4,
      2,
      2,
      2
    ],
    "prefix": null,
    "problem_type": null,
    "projection_dim": 1024,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "suppress_tokens": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tf_legacy_loss": false,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "typical_p": 1.0,
    "use_bfloat16": false,
    "visual_temporal_embedding": {
      "max_temporal_embeddings": 100,
      "type": "COSINE"
    },
    "window_size": 12
  },
  "vocab_size": 51289
}
generation_config.json ADDED
@@ -0,0 +1,4 @@
{
  "num_beams": 3,
  "transformers_version": "4.42.0.dev0"
}
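
The `num_beams: 3` default above matches `text_config.num_beams` in `config.json` and applies whenever `generate` is called without explicit decoding options. A minimal sketch of overriding it per call, assuming Transformers.js accepts generation options inline alongside the model inputs (as `max_new_tokens` is passed in the README example):

```js
// Sketch: override the beam-search default from generation_config.json for one call.
// Assumes `model`, `text_inputs`, and `vision_inputs` are prepared as in the README.
const generated_ids = await model.generate({
  ...text_inputs,
  ...vision_inputs,
  max_new_tokens: 256,
  num_beams: 1, // greedy decoding instead of the default 3-beam search
});
```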
merges.txt ADDED
The diff for this file is too large to render.
 
preprocessor_config.json ADDED
@@ -0,0 +1,83 @@
{
  "_valid_processor_keys": [
    "images",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "do_convert_rgb",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 768,
    "width": 768
  },
  "do_center_crop": false,
  "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_seq_length": 577,
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "processor_class": "Florence2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 768,
    "width": 768
  },
  "tasks_answer_post_processing_type": {
    "<OCR>": "pure_text",
    "<OCR_WITH_REGION>": "ocr",
    "<CAPTION>": "pure_text",
    "<DETAILED_CAPTION>": "pure_text",
    "<MORE_DETAILED_CAPTION>": "pure_text",
    "<OD>": "description_with_bboxes",
    "<DENSE_REGION_CAPTION>": "description_with_bboxes",
    "<CAPTION_TO_PHRASE_GROUNDING>": "phrase_grounding",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "polygons",
    "<REGION_TO_SEGMENTATION>": "polygons",
    "<OPEN_VOCABULARY_DETECTION>": "description_with_bboxes_or_polygons",
    "<REGION_TO_CATEGORY>": "pure_text",
    "<REGION_TO_DESCRIPTION>": "pure_text",
    "<REGION_TO_OCR>": "pure_text",
    "<REGION_PROPOSAL>": "bboxes"
  },
  "task_prompts_without_inputs": {
    "<OCR>": "What is the text in the image?",
    "<OCR_WITH_REGION>": "What is the text in the image, with regions?",
    "<CAPTION>": "What does the image describe?",
    "<DETAILED_CAPTION>": "Describe in detail what is shown in the image.",
    "<MORE_DETAILED_CAPTION>": "Describe with a paragraph what is shown in the image.",
    "<OD>": "Locate the objects with category name in the image.",
    "<DENSE_REGION_CAPTION>": "Locate the objects in the image, with their descriptions.",
    "<REGION_PROPOSAL>": "Locate the region proposals in the image."
  },
  "task_prompts_with_input": {
    "<CAPTION_TO_PHRASE_GROUNDING>": "Locate the phrases in the caption: {input}",
    "<REFERRING_EXPRESSION_SEGMENTATION>": "Locate {input} in the image with mask",
    "<REGION_TO_SEGMENTATION>": "What is the polygon mask of region {input}",
    "<OPEN_VOCABULARY_DETECTION>": "Locate {input} in the image.",
    "<REGION_TO_CATEGORY>": "What is the region {input}?",
    "<REGION_TO_DESCRIPTION>": "What does the region {input} describe?",
    "<REGION_TO_OCR>": "What text is in the region {input}?"
  }
}
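
The `{input}` placeholders in `task_prompts_with_input` are filled in by the processor's `construct_prompts` from whatever text follows the task token. A minimal sketch, assuming the Transformers.js implementation mirrors the Python `Florence2Processor` in splicing the trailing text into the template:

```js
// Hypothetical example of a task that takes extra input text: the caption
// written after the task token replaces {input} in the template above.
const task = '<CAPTION_TO_PHRASE_GROUNDING>';
const prompts = processor.construct_prompts(task + 'A green car parked in front of a yellow building.');
// prompts should resemble:
// 'Locate the phrases in the caption: A green car parked in front of a yellow building.'
```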
processing_florence2.py ADDED
@@ -0,0 +1,1088 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ from transformers.utils import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+
24
+ import torch
25
+
26
+ from transformers.feature_extraction_utils import BatchFeature
27
+ from transformers.image_utils import ImageInput, is_valid_image
28
+ from transformers.processing_utils import ProcessorMixin
29
+ from transformers.tokenization_utils_base import (
30
+ PaddingStrategy,
31
+ PreTokenizedInput,
32
+ TextInput,
33
+ TruncationStrategy,
34
+ )
35
+ from transformers.utils import TensorType
+ from transformers import BartTokenizer, BartTokenizerFast, T5Tokenizer, T5TokenizerFast  # used in decode_with_spans
36
+
37
+
38
+ logger = logging.get_logger(__name__)  # transformers logger, which provides warning_once
39
+
40
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
41
+ def is_url(val) -> bool:
42
+ return isinstance(val, str) and val.startswith("http")
43
+
44
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
45
+ def is_image_or_image_url(elem):
46
+ return is_url(elem) or is_valid_image(elem)
47
+
48
+
49
+ def _is_str_or_image(elem):
50
+ return isinstance(elem, (str)) or is_image_or_image_url(elem)
51
+
52
+
53
+ class Florence2Processor(ProcessorMixin):
54
+ r"""
55
+ Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.
56
+
57
+ [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
58
+ [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.
59
+
60
+ Args:
61
+ image_processor ([`CLIPImageProcessor`], *optional*):
62
+ The image processor is a required input.
63
+ tokenizer ([`BartTokenizerFast`], *optional*):
64
+ The tokenizer is a required input.
65
+ """
66
+
67
+ attributes = ["image_processor", "tokenizer"]
68
+ image_processor_class = "CLIPImageProcessor"
69
+ tokenizer_class = ("BartTokenizer", "BartTokenizerFast")
70
+
71
+ def __init__(
72
+ self,
73
+ image_processor=None,
74
+ tokenizer=None,
75
+ ):
76
+ if image_processor is None:
77
+ raise ValueError("You need to specify an `image_processor`.")
78
+ if tokenizer is None:
79
+ raise ValueError("You need to specify a `tokenizer`.")
80
+ if not hasattr(image_processor, "image_seq_length"):
81
+ raise ValueError("Image processor is missing an `image_seq_length` attribute.")
82
+
83
+ self.image_seq_length = image_processor.image_seq_length
84
+
85
+ tokens_to_add = {
86
+ 'additional_special_tokens': \
87
+ tokenizer.additional_special_tokens + \
88
+ ['<od>', '</od>', '<ocr>', '</ocr>'] + \
89
+ [f'<loc_{x}>' for x in range(1000)] + \
90
+ ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
91
+ }
92
+ tokenizer.add_special_tokens(tokens_to_add)
93
+
94
+ self.tasks_answer_post_processing_type = {
95
+ '<OCR>': 'pure_text',
96
+ '<OCR_WITH_REGION>': 'ocr',
97
+ '<CAPTION>': 'pure_text',
98
+ '<DETAILED_CAPTION>': 'pure_text',
99
+ '<MORE_DETAILED_CAPTION>': 'pure_text',
100
+ '<OD>': 'description_with_bboxes',
101
+ '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
102
+ '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
103
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
104
+ '<REGION_TO_SEGMENTATION>': 'polygons',
105
+ '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
106
+ '<REGION_TO_CATEGORY>': 'pure_text',
107
+ '<REGION_TO_DESCRIPTION>': 'pure_text',
108
+ '<REGION_TO_OCR>': 'pure_text',
109
+ '<REGION_PROPOSAL>': 'bboxes'
110
+ }
111
+
112
+ self.task_prompts_without_inputs = {
113
+ '<OCR>': 'What is the text in the image?',
114
+ '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
115
+ '<CAPTION>': 'What does the image describe?',
116
+ '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
117
+ '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
118
+ '<OD>': 'Locate the objects with category name in the image.',
119
+ '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
120
+ '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
121
+ }
122
+
123
+ self.task_prompts_with_input = {
124
+ '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
125
+ '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
126
+ '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
127
+ '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
128
+ '<REGION_TO_CATEGORY>': 'What is the region {input}?',
129
+ '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
130
+ '<REGION_TO_OCR>': 'What text is in the region {input}?',
131
+ }
132
+
133
+ self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)
134
+
135
+
136
+ super().__init__(image_processor, tokenizer)
137
+
138
+ def _construct_prompts(self, text):
139
+ # replace the task tokens with the task prompts if task token is in the text
140
+ prompts = []
141
+ for _text in text:
142
+ # 1. fixed task prompts without additional inputs
143
+ for task_token, task_prompt in self.task_prompts_without_inputs.items():
144
+ if task_token in _text:
145
+ assert _text == task_token, f"Task token {task_token} should be the only token in the text."
146
+ _text = task_prompt
147
+ break
148
+ # 2. task prompts with additional inputs
149
+ for task_token, task_prompt in self.task_prompts_with_input.items():
150
+ if task_token in _text:
151
+ _text = task_prompt.format(input=_text.replace(task_token, ''))
152
+ break
153
+ prompts.append(_text)
154
+ return prompts
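+ # Sketch of the mapping above (comment added for clarity, not upstream code):
+ # _construct_prompts(["<OD>"]) -> ["Locate the objects with category name in the image."]
+ # _construct_prompts(["<CAPTION_TO_PHRASE_GROUNDING>a green car"]) -> ["Locate the phrases in the caption: a green car"]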
155
+
156
+ def __call__(
157
+ self,
158
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
159
+ images: ImageInput = None,
160
+ tokenize_newline_separately: bool = True,
161
+ padding: Union[bool, str, PaddingStrategy] = False,
162
+ truncation: Union[bool, str, TruncationStrategy] = None,
163
+ max_length=None,
164
+ return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
165
+ do_resize: bool = None,
166
+ do_normalize: bool = None,
167
+ image_mean: Optional[Union[float, List[float]]] = None,
168
+ image_std: Optional[Union[float, List[float]]] = None,
169
+ data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
170
+ input_data_format: Optional[
171
+ Union[str, "ChannelDimension"] # noqa: F821
172
+ ] = None,
173
+ resample: "PILImageResampling" = None, # noqa: F821
174
+ do_convert_rgb: bool = None,
175
+ do_thumbnail: bool = None,
176
+ do_align_long_axis: bool = None,
177
+ do_rescale: bool = None,
178
+ ) -> BatchFeature:
179
+ """
180
+ Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
181
+ and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
182
+ the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
183
+ CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
184
+ of the above two methods for more information.
185
+
186
+ Args:
187
+ text (`str`, `List[str]`, `List[List[str]]`):
188
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
189
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
190
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
191
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
192
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
193
+ tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
194
+ number of channels, H and W are image height and width.
195
+ tokenize_newline_separately (`bool`, defaults to `True`):
196
+ Adds a separately tokenized '\n' at the end of the prompt.
197
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
198
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
199
+ index) among:
200
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
201
+ sequence is provided).
202
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
203
+ acceptable input length for the model if that argument is not provided.
204
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
205
+ lengths).
206
+ max_length (`int`, *optional*):
207
+ Maximum length of the returned list and optionally padding length (see above).
208
+ truncation (`bool`, *optional*):
209
+ Activates truncation to cut input sequences longer than `max_length` to `max_length`.
210
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
211
+ If set, will return tensors of a particular framework. Acceptable values are:
212
+
213
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
214
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
215
+ - `'np'`: Return NumPy `np.ndarray` objects.
216
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
217
+
218
+ Returns:
219
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
220
+
221
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
222
+ is provided, the `input_ids` will also contain the suffix input ids.
223
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
224
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
225
+ `None`).
226
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
227
+ - **labels** -- Labels compatible with training if `suffix` is not None
228
+ """
229
+
230
+ return_token_type_ids = False
231
+
232
+ if images is None:
233
+ raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
234
+ if text is None:
235
+ logger.warning_once(
236
+ "You are using Florence-2 without a text prompt."
237
+ )
238
+ text = ""
239
+
240
+ if isinstance(text, List) and isinstance(images, List):
241
+ if len(images) < len(text):
242
+ raise ValueError(
243
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
244
+ )
245
+ if _is_str_or_image(text):
246
+ text = [text]
247
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
248
+ pass
249
+
250
+ pixel_values = self.image_processor(
251
+ images,
252
+ do_resize=do_resize,
253
+ do_normalize=do_normalize,
254
+ return_tensors=return_tensors,
255
+ image_mean=image_mean,
256
+ image_std=image_std,
257
+ input_data_format=input_data_format,
258
+ data_format=data_format,
259
+ resample=resample,
260
+ do_convert_rgb=do_convert_rgb,
261
+ )["pixel_values"]
262
+
263
+ if max_length is not None:
264
+ max_length -= self.image_seq_length # max_length has to account for the image tokens
265
+
266
+ text = self._construct_prompts(text)
267
+
268
+ inputs = self.tokenizer(
269
+ text,
270
+ return_tensors=return_tensors,
271
+ padding=padding,
272
+ max_length=max_length,
273
+ truncation=truncation,
274
+ return_token_type_ids=return_token_type_ids,
275
+ )
276
+
277
+ return_data = {**inputs, "pixel_values": pixel_values}
278
+
279
+ if return_token_type_ids:
280
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
281
+ return_data.update({"labels": labels})
282
+ return BatchFeature(data=return_data)
283
+
284
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
285
+ def batch_decode(self, *args, **kwargs):
286
+ """
287
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
288
+ refer to the docstring of this method for more information.
289
+ """
290
+ return self.tokenizer.batch_decode(*args, **kwargs)
291
+
292
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
293
+ def decode(self, *args, **kwargs):
294
+ """
295
+ This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
296
+ the docstring of this method for more information.
297
+ """
298
+ return self.tokenizer.decode(*args, **kwargs)
299
+
300
+ @property
301
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
302
+ def model_input_names(self):
303
+ tokenizer_input_names = self.tokenizer.model_input_names
304
+ image_processor_input_names = self.image_processor.model_input_names
305
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
306
+
307
+ def post_process_generation(self, text, task, image_size):
308
+ """
309
+ Post-process the output of the model to each of the task outputs.
310
+
311
+ Args:
312
+ text (`str`): The text to post-process.
313
+ task (`str`): The task to post-process the text for.
314
+ image_size (`Tuple[int, int]`): The size of the image, as (width, height).
315
+ """
316
+
317
+ task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
318
+ task_answer = self.post_processor(
319
+ text=text,
320
+ image_size=image_size,
321
+ parse_tasks=task_answer_post_processing_type,
322
+ )[task_answer_post_processing_type]
323
+
324
+ if task_answer_post_processing_type == 'pure_text':
325
+ final_answer = task_answer
326
+ # remove the special tokens
327
+ final_answer = final_answer.replace('<s>', '').replace('</s>', '')
328
+ elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
329
+ od_instances = task_answer
330
+ bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
331
+ labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
332
+ final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
333
+ elif task_answer_post_processing_type in ['ocr']:
334
+ bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
335
+ labels = [str(_od_instance['text']) for _od_instance in task_answer]
336
+ final_answer = {'quad_boxes': bboxes, 'labels': labels}
337
+ elif task_answer_post_processing_type in ['phrase_grounding']:
338
+ bboxes = []
339
+ labels = []
340
+ for _grounded_phrase in task_answer:
341
+ for _bbox in _grounded_phrase['bbox']:
342
+ bboxes.append(_bbox)
343
+ labels.append(_grounded_phrase['cat_name'])
344
+ final_answer = {'bboxes': bboxes, 'labels': labels}
345
+ elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
346
+ labels = []
347
+ polygons = []
348
+ for result in task_answer:
349
+ label = result['cat_name']
350
+ _polygons = result['polygons']
351
+ labels.append(label)
352
+ polygons.append(_polygons)
353
+ final_answer = {'polygons': polygons, 'labels': labels}
354
+ elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
355
+ bboxes = []
356
+ bboxes_labels = []
357
+ polygons = []
358
+ polygons_labels = []
359
+ for result in task_answer:
360
+ label = result['cat_name']
361
+ if 'polygons' in result:
362
+ _polygons = result['polygons']
363
+ polygons.append(_polygons)
364
+ polygons_labels.append(label)
365
+ else:
366
+ _bbox = result['bbox']
367
+ bboxes.append(_bbox)
368
+ bboxes_labels.append(label)
369
+ final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
370
+ else:
371
+ raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))
372
+
373
+ final_answer = {
374
+ task: final_answer}
375
+ return final_answer
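+ # Example result shape (illustrative, not upstream code): for task '<OD>' this
+ # returns {'<OD>': {'bboxes': [[xmin, ymin, xmax, ymax], ...], 'labels': ['cat', ...]}}.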
376
+
377
+ class BoxQuantizer(object):
378
+ def __init__(self, mode, bins):
379
+ self.mode = mode
380
+ self.bins = bins
381
+
382
+ def quantize(self, boxes: torch.Tensor, size):
383
+ bins_w, bins_h = self.bins # Quantization bins.
384
+ size_w, size_h = size # Original image size.
385
+ size_per_bin_w = size_w / bins_w
386
+ size_per_bin_h = size_h / bins_h
387
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
388
+
389
+ if self.mode == 'floor':
390
+ quantized_xmin = (
391
+ xmin / size_per_bin_w).floor().clamp(0, bins_w - 1)
392
+ quantized_ymin = (
393
+ ymin / size_per_bin_h).floor().clamp(0, bins_h - 1)
394
+ quantized_xmax = (
395
+ xmax / size_per_bin_w).floor().clamp(0, bins_w - 1)
396
+ quantized_ymax = (
397
+ ymax / size_per_bin_h).floor().clamp(0, bins_h - 1)
398
+
399
+ elif self.mode == 'round':
400
+ raise NotImplementedError()
401
+
402
+ else:
403
+ raise ValueError('Incorrect quantization type.')
404
+
405
+ quantized_boxes = torch.cat(
406
+ (quantized_xmin, quantized_ymin, quantized_xmax, quantized_ymax), dim=-1
407
+ ).int()
408
+
409
+ return quantized_boxes
410
+
411
+ def dequantize(self, boxes: torch.Tensor, size):
412
+ bins_w, bins_h = self.bins # Quantization bins.
413
+ size_w, size_h = size # Original image size.
414
+ size_per_bin_w = size_w / bins_w
415
+ size_per_bin_h = size_h / bins_h
416
+ xmin, ymin, xmax, ymax = boxes.split(1, dim=-1) # Shape: 4 * [N, 1].
417
+
418
+ if self.mode == 'floor':
419
+ # Add 0.5 to use the center position of the bin as the coordinate.
420
+ dequantized_xmin = (xmin + 0.5) * size_per_bin_w
421
+ dequantized_ymin = (ymin + 0.5) * size_per_bin_h
422
+ dequantized_xmax = (xmax + 0.5) * size_per_bin_w
423
+ dequantized_ymax = (ymax + 0.5) * size_per_bin_h
424
+
425
+ elif self.mode == 'round':
426
+ raise NotImplementedError()
427
+
428
+ else:
429
+ raise ValueError('Incorrect quantization type.')
430
+
431
+ dequantized_boxes = torch.cat(
432
+ (dequantized_xmin, dequantized_ymin,
433
+ dequantized_xmax, dequantized_ymax), dim=-1
434
+ )
435
+
436
+ return dequantized_boxes
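+ # Worked example (illustrative): with 1000x1000 bins on a 768x768 image,
+ # size_per_bin_w = 768 / 1000 = 0.768, so bin 500 dequantizes to
+ # (500 + 0.5) * 0.768 = 384.384 px, the centre of the bin rather than its left edge.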
437
+
438
+
439
+ class CoordinatesQuantizer(object):
440
+ """
441
+ Quantize coordinates (Nx2)
442
+ """
443
+
444
+ def __init__(self, mode, bins):
445
+ self.mode = mode
446
+ self.bins = bins
447
+
448
+ def quantize(self, coordinates: torch.Tensor, size):
449
+ bins_w, bins_h = self.bins # Quantization bins.
450
+ size_w, size_h = size # Original image size.
451
+ size_per_bin_w = size_w / bins_w
452
+ size_per_bin_h = size_h / bins_h
453
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
454
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
455
+
456
+ if self.mode == 'floor':
457
+ quantized_x = (x / size_per_bin_w).floor().clamp(0, bins_w - 1)
458
+ quantized_y = (y / size_per_bin_h).floor().clamp(0, bins_h - 1)
459
+
460
+ elif self.mode == 'round':
461
+ raise NotImplementedError()
462
+
463
+ else:
464
+ raise ValueError('Incorrect quantization type.')
465
+
466
+ quantized_coordinates = torch.cat(
467
+ (quantized_x, quantized_y), dim=-1
468
+ ).int()
469
+
470
+ return quantized_coordinates
471
+
472
+ def dequantize(self, coordinates: torch.Tensor, size):
473
+ bins_w, bins_h = self.bins # Quantization bins.
474
+ size_w, size_h = size # Original image size.
475
+ size_per_bin_w = size_w / bins_w
476
+ size_per_bin_h = size_h / bins_h
477
+ assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
478
+ x, y = coordinates.split(1, dim=-1) # Shape: 2 * [N, 1].
479
+
480
+ if self.mode == 'floor':
481
+ # Add 0.5 to use the center position of the bin as the coordinate.
482
+ dequantized_x = (x + 0.5) * size_per_bin_w
483
+ dequantized_y = (y + 0.5) * size_per_bin_h
484
+
485
+ elif self.mode == 'round':
486
+ raise NotImplementedError()
487
+
488
+ else:
489
+ raise ValueError('Incorrect quantization type.')
490
+
491
+ dequantized_coordinates = torch.cat(
492
+ (dequantized_x, dequantized_y), dim=-1
493
+ )
494
+
495
+ return dequantized_coordinates
496
+
497
+
498
+ class Florence2PostProcesser(object):
499
+ """
500
+ Florence-2 post-processor for converting text predictions into task-specific results.
501
+
502
+ Args:
503
+ config: A dict of configs.
504
+ tokenizer: A tokenizer for decoding text to spans.
505
+ sample config:
506
+ UNIFIED_POST_PROCESS:
507
+ # common configs
508
+ NUM_BBOX_HEIGHT_BINS: 1000
509
+ NUM_BBOX_WIDTH_BINS: 1000
510
+ COORDINATES_HEIGHT_BINS: 1000
511
+ COORDINATES_WIDTH_BINS: 1000
512
+ # task specific configs, override the common configs
513
+ PARSE_TASKS:
514
+ - TASK_NAME: 'video_dense_caption'
515
+ PATTERN: r'<time_(\d+)><time_(\d+)>([a-zA-Z0-9 ]+)'
516
+ SCORE_MODE: 'avg_cat_name_scores'
517
+ NUM_BINS: 100
518
+ - TASK_NAME: 'od'
519
+ PATTERN: r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>([a-zA-Z0-9 ]+)'
520
+ SCORE_MODE: 'avg_cat_name_scores'
521
+
522
+ Returns:
523
+ parsed_dict (dict): A dict of parsed results.
524
+ """
525
+ def __init__(
526
+ self,
527
+ tokenizer=None
528
+ ):
529
+ parse_tasks = []
530
+ parse_task_configs = {}
531
+ config = self._create_default_config()
532
+ for task in config['PARSE_TASKS']:
533
+ parse_tasks.append(task['TASK_NAME'])
534
+ parse_task_configs[task['TASK_NAME']] = task
535
+
536
+ self.config = config
537
+ self.parse_tasks = parse_tasks
538
+ self.parse_tasks_configs = parse_task_configs
539
+
540
+ self.tokenizer = tokenizer
541
+ if self.tokenizer is not None:
542
+ self.all_special_tokens = set(self.tokenizer.all_special_tokens)
543
+
544
+ self.init_quantizers()
545
+ self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()
546
+
547
+ def _create_black_list_of_phrase_grounding(self):
548
+ black_list = set()
549
+
550
+ if 'phrase_grounding' in self.parse_tasks and self.parse_tasks_configs['phrase_grounding']['FILTER_BY_BLACK_LIST']:
551
+ black_list = set(
552
+ ['it', 'I', 'me', 'mine',
553
+ 'you', 'your', 'yours',
554
+ 'he', 'him', 'his',
555
+ 'she', 'her', 'hers',
556
+ 'they', 'them', 'their', 'theirs',
557
+ 'one', 'oneself',
558
+ 'we', 'us', 'our', 'ours',
559
+ 'you', 'your', 'yours',
560
+ 'they', 'them', 'their', 'theirs',
561
+ 'mine', 'yours', 'his', 'hers', 'its',
562
+ 'ours', 'yours', 'theirs',
563
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
564
+ 'ourselves', 'yourselves', 'themselves',
565
+ 'this', 'that',
566
+ 'these', 'those',
567
+ 'who', 'whom', 'whose', 'which', 'what',
568
+ 'who', 'whom', 'whose', 'which', 'that',
569
+ 'all', 'another', 'any', 'anybody', 'anyone', 'anything',
570
+ 'each', 'everybody', 'everyone', 'everything',
571
+ 'few', 'many', 'nobody', 'none', 'one', 'several',
572
+ 'some', 'somebody', 'someone', 'something',
573
+ 'each other', 'one another',
574
+ 'myself', 'yourself', 'himself', 'herself', 'itself',
575
+ 'ourselves', 'yourselves', 'themselves',
576
+ 'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
577
+ 'other objects', 'lots', 'a set',
578
+ ]
579
+ )
580
+
581
+ return black_list
582
+
583
+ def _create_default_config(self):
584
+ config = {
585
+ 'NUM_BBOX_HEIGHT_BINS': 1000,
586
+ 'NUM_BBOX_WIDTH_BINS': 1000,
587
+ 'BOX_QUANTIZATION_MODE': 'floor',
588
+ 'COORDINATES_HEIGHT_BINS': 1000,
589
+ 'COORDINATES_WIDTH_BINS': 1000,
590
+ 'COORDINATES_QUANTIZATION_MODE': 'floor',
591
+ 'PARSE_TASKS': [
592
+ {
593
+ 'TASK_NAME': 'od',
594
+ 'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
595
+ },
596
+ {
597
+ 'TASK_NAME': 'ocr',
598
+ 'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
599
+ 'AREA_THRESHOLD': 0.00
600
+ },
601
+ {
602
+ 'TASK_NAME': 'phrase_grounding',
603
+ 'FILTER_BY_BLACK_LIST': True
604
+ },
605
+ {
606
+ 'TASK_NAME': 'pure_text',
607
+ },
608
+ {
609
+ 'TASK_NAME': 'description_with_bboxes',
610
+ },
611
+ {
612
+ 'TASK_NAME': 'description_with_polygons',
613
+ },
614
+ {
615
+ 'TASK_NAME': 'polygons',
616
+ },
617
+ {
618
+ 'TASK_NAME': 'bboxes',
619
+ },
620
+ {
621
+ 'TASK_NAME': 'description_with_bboxes_or_polygons',
622
+ }
623
+ ]
624
+ }
625
+
626
+ return config
627
+
628
+ def init_quantizers(self):
629
+ # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
630
+ num_bbox_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
631
+ num_bbox_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
632
+ box_quantization_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
633
+ self.box_quantizer = BoxQuantizer(
634
+ box_quantization_mode,
635
+ (num_bbox_width_bins, num_bbox_height_bins),
636
+ )
637
+
638
+ num_bbox_height_bins = self.config['COORDINATES_HEIGHT_BINS'] if 'COORDINATES_HEIGHT_BINS' in self.config else self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
639
+ num_bbox_width_bins = self.config['COORDINATES_WIDTH_BINS'] if 'COORDINATES_WIDTH_BINS' in self.config else self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
640
+ box_quantization_mode = self.config.get('COORDINATES_QUANTIZATION_MODE') if 'COORDINATES_QUANTIZATION_MODE' in self.config else self.config.get('BOX_QUANTIZATION_MODE', 'floor')
641
+ self.coordinates_quantizer = CoordinatesQuantizer(
642
+ box_quantization_mode,
643
+ (num_bbox_width_bins, num_bbox_height_bins),
644
+ )
645
+
646
+ def decode_with_spans(self, tokenizer, token_ids):
647
+ filtered_tokens = tokenizer.convert_ids_to_tokens(
648
+ token_ids, skip_special_tokens=False)
649
+ assert len(filtered_tokens) == len(token_ids)
650
+
651
+ # To avoid mixing byte-level and unicode for byte-level BPE
652
+ # we need to build string separately for added tokens and byte-level tokens
653
+ # cf. https://github.com/huggingface/transformers/issues/1133
654
+ sub_texts = []
655
+ for token in filtered_tokens:
656
+ if token in self.all_special_tokens:
657
+ sub_texts.append(token)
658
+ else:
659
+ if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
660
+ sub_text = tokenizer.convert_tokens_to_string([token])
661
+ elif isinstance(tokenizer, (T5Tokenizer, T5TokenizerFast)):
662
+ # Ref: https://github.com/google/sentencepiece#whitespace-is-treated-as-a-basic-symbol
663
+ # Note: Do not strip sub_text as it may have functional whitespace
664
+ sub_text = token.replace('▁', ' ')
665
+ else:
666
+ raise ValueError(f'type {type(tokenizer)} not supported')
667
+ sub_texts.append(sub_text)
668
+
669
+ text = ''
670
+ spans = []
671
+ for sub_text in sub_texts:
672
+ span = (len(text), len(text) + len(sub_text)) # [start index, end index).
673
+ text += sub_text
674
+ spans.append(span)
675
+
676
+ # Text format:
677
+ # 1. T5Tokenizer/T5TokenizerFast:
678
+ # "<loc_1><loc_2><loc_3><loc_4> transplanting dog<loc_1><loc_2><loc_3><loc_4> cat</s>"
679
+ # Equivalent to t5_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
680
+ # 2. BartTokenizer (need to double check):
681
+ # "<s><loc_1><loc_2><loc_3><loc_4>transplanting dog<loc_1><loc_2><loc_3><loc_4>cat</s>"
682
+ # Equivalent to bart_tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False, spaces_between_special_tokens=False)
683
+ return text, spans
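+ # Span example (illustrative): decoding the tokens ['<s>', 'cat'] yields
+ # text '<s>cat' with spans [(0, 3), (3, 6)], character ranges that map each
+ # token back into the concatenated string.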
684
+
685
+ def parse_od_from_text_and_spans(
686
+ self,
687
+ text,
688
+ pattern,
689
+ image_size,
690
+ phrase_centric=False
691
+ ):
692
+ parsed = list(re.finditer(pattern, text))
693
+
694
+ instances = []
695
+ for i in range(len(parsed)):
696
+ # Prepare instance.
697
+ instance = {}
698
+
699
+ if phrase_centric:
700
+ bbox_bins = [int(parsed[i].group(j)) for j in range(2, 6)]
701
+ else:
702
+ bbox_bins = [int(parsed[i].group(j)) for j in range(1, 5)]
703
+ instance['bbox'] = self.box_quantizer.dequantize(
704
+ boxes=torch.tensor(bbox_bins),
705
+ size=image_size
706
+ ).tolist()
707
+
708
+ if phrase_centric:
709
+ instance['cat_name'] = parsed[i].group(1).lower().strip()
710
+ else:
711
+ instance['cat_name'] = parsed[i].group(5).lower().strip()
712
+ instances.append(instance)
713
+
714
+ return instances
715
+
716
+ def parse_ocr_from_text_and_spans(self,
717
+ text,
718
+ pattern,
719
+ image_size,
720
+ area_threshold=-1.0,
721
+ ):
722
+ bboxes = []
723
+ labels = []
724
+ text = text.replace('<s>', '')
725
+ # ocr with regions
726
+ parsed = re.findall(pattern, text)
727
+ instances = []
728
+ image_width, image_height = image_size
729
+
730
+ for ocr_line in parsed:
731
+ ocr_content = ocr_line[0]
732
+ quad_box = ocr_line[1:]
733
+ quad_box = [int(i) for i in quad_box]
734
+ quad_box = self.coordinates_quantizer.dequantize(
735
+ torch.tensor(np.array(quad_box).reshape(-1, 2)),
736
+ size=image_size
737
+ ).reshape(-1).tolist()
738
+
739
+ if area_threshold > 0:
740
+ x_coords = [i for i in quad_box[0::2]]
741
+ y_coords = [i for i in quad_box[1::2]]
742
+
743
+ # apply the Shoelace formula
744
+ area = 0.5 * abs(sum(x_coords[i] * y_coords[(i + 1) % 4] - x_coords[(i + 1) % 4] * y_coords[i] for i in range(4)))  # include the wrap-around edge
745
+
746
+ if area < (image_width * image_height) * area_threshold:
747
+ continue
748
+
749
+ bboxes.append(quad_box)
750
+ labels.append(ocr_content)
751
+ instances.append({
752
+ 'quad_box': quad_box,
753
+ 'text': ocr_content,
754
+ })
755
+ return instances
756
+
757
+ def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
758
+ # ignore <s> </s> and <pad>
759
+ cur_span = 0
760
+ if text.startswith('<s>'):
761
+ cur_span += 3
762
+
763
+ text = text.replace('<s>', '')
764
+ text = text.replace('</s>', '')
765
+ text = text.replace('<pad>', '')
766
+
767
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
768
+ phrases = re.findall(pattern, text)
769
+
770
+ # pattern should be text pattern and od pattern
771
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
772
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
773
+
774
+ instances = []
775
+ for pharse_text in phrases:
776
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
777
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
778
+
779
+ if phrase_text_strip == '':
780
+ cur_span += len(pharse_text)
781
+ continue
782
+
783
+ # Prepare instance.
784
+ instance = {}
785
+
786
+ # parse phrase, get string
787
+ phrase = re.search(pattern, phrase_text_strip)
788
+ if phrase is None:
789
+ cur_span += len(pharse_text)
790
+ continue
791
+
792
+ # parse bboxes by box_pattern
793
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
794
+ if len(bboxes_parsed) == 0:
795
+ cur_span += len(pharse_text)
796
+ continue
797
+
798
+ phrase = phrase.group()
799
+ # remove leading and trailing spaces
800
+ phrase = phrase.strip()
801
+
802
+ if phrase in self.black_list_of_phrase_grounding:
803
+ cur_span += len(pharse_text)
804
+ continue
805
+
806
+ # a list of list
807
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
808
+ instance['bbox'] = self.box_quantizer.dequantize(
809
+ boxes=torch.tensor(bbox_bins),
810
+ size=image_size
811
+ ).tolist()
812
+
813
+ # exclude non-ascii characters
814
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
815
+ instance['cat_name'] = phrase
816
+
817
+ instances.append(instance)
818
+
819
+ return instances
820
+
821
+ def parse_description_with_bboxes_from_text_and_spans(self, text, pattern, image_size, allow_empty_phrase=False):
822
+ # temporary parse solution, split by '.'
823
+ # ignore <s> </s> and <pad>
824
+
825
+ text = text.replace('<s>', '')
826
+ text = text.replace('</s>', '')
827
+ text = text.replace('<pad>', '')
828
+
829
+ if allow_empty_phrase:
830
+ pattern = rf"(?:(?:<loc_\d+>){{4,}})"
831
+ else:
832
+ pattern = r"([^<]+(?:<loc_\d+>){4,})"
833
+ phrases = re.findall(pattern, text)
834
+
835
+ # pattern should be text pattern and od pattern
836
+ pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
837
+ box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
838
+
839
+ instances = []
840
+ for pharse_text in phrases:
841
+ phrase_text_strip = pharse_text.replace('<ground>', '', 1)
842
+ phrase_text_strip = phrase_text_strip.replace('<obj>', '', 1)
843
+
844
+ if phrase_text_strip == '' and not allow_empty_phrase:
845
+ continue
846
+
847
+ # parse phrase, get string
848
+ phrase = re.search(pattern, phrase_text_strip)
849
+ if phrase is None:
850
+ continue
851
+
852
+ phrase = phrase.group()
853
+ # remove leading and trailing spaces
854
+ phrase = phrase.strip()
855
+
856
+ # parse bboxes by box_pattern
857
+ bboxes_parsed = list(re.finditer(box_pattern, pharse_text))
858
+ if len(bboxes_parsed) == 0:
859
+ continue
860
+
861
+ # a list of list
862
+ bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]
863
+
864
+ bboxes = self.box_quantizer.dequantize(
865
+ boxes=torch.tensor(bbox_bins),
866
+ size=image_size
867
+ ).tolist()
868
+
869
+ phrase = phrase.encode('ascii',errors='ignore').decode('ascii')
870
+ for _bboxes in bboxes:
871
+ # Prepare instance.
872
+ instance = {}
873
+ instance['bbox'] = _bboxes
874
+ # exclude non-ascii characters
875
+ instance['cat_name'] = phrase
876
+ instances.append(instance)
877
+
878
+ return instances
879
+
880
+ def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
881
+ allow_empty_phrase=False,
882
+ polygon_sep_token='<sep>',
883
+ polygon_start_token='<poly>',
884
+ polygon_end_token='</poly>',
885
+ with_box_at_start=False,
886
+ ):
887
+
888
+ # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
889
+ # ignore <s> </s> and <pad>
890
+
891
+ text = text.replace('<s>', '')
892
+ text = text.replace('</s>', '')
893
+ text = text.replace('<pad>', '')
894
+
895
+ if allow_empty_phrase:
896
+ pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
897
+ else:
898
+ # [^<]+: This part matches one or more characters that are not the < symbol.
899
+ # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
900
+ #
901
+ pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
902
+ phrases = re.findall(pattern, text)
903
+
904
+ phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
905
+ box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'
906
+
907
+ # one polygons instance is separated by polygon_start_token and polygon_end_token
908
+ polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'
909
+
910
+ instances = []
911
+ for phrase_text in phrases:
912
+
913
+ # exclude loc_\d+>
914
+ # need to get span if want to include category score
915
+ phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)
916
+
917
+ # phrase = phrase.replace('<poly>', '')
918
+ # phrase = phrase.replace('poly>', '')
919
+
920
+ if phrase_text_strip == '' and not allow_empty_phrase:
921
+ continue
922
+
923
+
924
+ # parse phrase, get string
925
+ phrase = re.search(phrase_string_pattern, phrase_text_strip)
926
+ if phrase is None:
927
+ continue
928
+ phrase = phrase.group()
929
+ # remove leading and trailing spaces
930
+ phrase = phrase.strip()
931
+
932
+ # parse bboxes by box_pattern
933
+
934
+ # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
935
+ if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
936
+ polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
937
+ else:
938
+ polygons_instances_parsed = [phrase_text]
939
+
940
+ for _polygons_instances_parsed in polygons_instances_parsed:
941
+ # Prepare instance.
942
+ instance = {}
943
+
944
+ # polygons_parsed= list(re.finditer(box_pattern, phrase_text))
945
+ if isinstance(_polygons_instances_parsed, str):
946
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed))
947
+ else:
948
+ polygons_parsed= list(re.finditer(box_pattern, _polygons_instances_parsed.group(1)))
949
+ if len(polygons_parsed) == 0:
950
+ continue
951
+
952
+ # a list of list (polygon)
953
+ bbox = []
954
+ polygons = []
955
+ for _polygon_parsed in polygons_parsed:
956
+ # group 1: whole <loc_\d+>...</loc_\d+>
957
+ _polygon = _polygon_parsed.group(1)
958
+ # parse into list of int
959
+ _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
960
+ if with_box_at_start and len(bbox) == 0:
961
+ if len(_polygon) > 4:
962
+ # the first four coordinates encode the leading bbox prediction
963
+ bbox = _polygon[:4]
964
+ _polygon = _polygon[4:]
965
+ else:
966
+ bbox = [0, 0, 0, 0]
967
+ # abandon last element if is not paired
968
+ if len(_polygon) % 2 == 1:
969
+ _polygon = _polygon[:-1]
970
+
971
+ # reshape into (n, 2)
972
+ _polygon = self.coordinates_quantizer.dequantize(
973
+ torch.tensor(np.array(_polygon).reshape(-1, 2)),
974
+ size=image_size
975
+ ).reshape(-1).tolist()
976
+ # reshape back
977
+ polygons.append(_polygon)
978
+
979
+ instance['cat_name'] = phrase
980
+ instance['polygons'] = polygons
981
+ if len(bbox) != 0:
982
+ instance['bbox'] = self.box_quantizer.dequantize(
983
+ boxes=torch.tensor([bbox]),
984
+ size=image_size
985
+ ).tolist()[0]
986
+
987
+ instances.append(instance)
988
+
989
+ return instances
990
+
991
+ def __call__(
992
+ self,
993
+ text=None,
994
+ image_size=None,
995
+ parse_tasks=None,
996
+ ):
997
+ """
998
+ Args:
999
+ text: model outputs
1000
+ image_size: (width, height)
1001
+ parse_tasks: a list of tasks to parse, if None, parse all tasks.
1002
+
1003
+ """
1004
+ if parse_tasks is not None:
1005
+ if isinstance(parse_tasks, str):
1006
+ parse_tasks = [parse_tasks]
1007
+ for _parse_task in parse_tasks:
1008
+ assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'
1009
+
1010
+ # sequence or text should be provided
1011
+ assert text is not None, 'text should be provided'
1012
+
1013
+ parsed_dict = {
1014
+ 'text': text
1015
+ }
1016
+
1017
+ for task in self.parse_tasks:
1018
+ if parse_tasks is not None and task not in parse_tasks:
1019
+ continue
1020
+
1021
+ pattern = self.parse_tasks_configs[task].get('PATTERN', None)
1022
+
1023
+ if task == 'ocr':
1024
+ instances = self.parse_ocr_from_text_and_spans(
1025
+ text,
1026
+ pattern=pattern,
1027
+ image_size=image_size,
1028
+ area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
1029
+ )
1030
+ parsed_dict['ocr'] = instances
1031
+ elif task == 'phrase_grounding':
1032
+ instances = self.parse_phrase_grounding_from_text_and_spans(
1033
+ text,
1034
+ pattern=pattern,
1035
+ image_size=image_size,
1036
+ )
1037
+ parsed_dict['phrase_grounding'] = instances
1038
+ elif task == 'pure_text':
1039
+ parsed_dict['pure_text'] = text
1040
+ elif task == 'description_with_bboxes':
1041
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1042
+ text,
1043
+ pattern=pattern,
1044
+ image_size=image_size,
1045
+ )
1046
+ parsed_dict['description_with_bboxes'] = instances
1047
+ elif task == 'description_with_polygons':
1048
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1049
+ text,
1050
+ pattern=pattern,
1051
+ image_size=image_size,
1052
+ )
1053
+ parsed_dict['description_with_polygons'] = instances
1054
+ elif task == 'polygons':
1055
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1056
+ text,
1057
+ pattern=pattern,
1058
+ image_size=image_size,
1059
+ allow_empty_phrase=True,
1060
+ )
1061
+ parsed_dict['polygons'] = instances
1062
+ elif task == 'bboxes':
1063
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1064
+ text,
1065
+ pattern=pattern,
1066
+ image_size=image_size,
1067
+ allow_empty_phrase=True,
1068
+ )
1069
+ parsed_dict['bboxes'] = instances
1070
+ elif task == 'description_with_bboxes_or_polygons':
1071
+ if '<poly>' in text:
1072
+ # only support either polygons or bboxes, not both at the same time
1073
+ instances = self.parse_description_with_polygons_from_text_and_spans(
1074
+ text,
1075
+ pattern=pattern,
1076
+ image_size=image_size,
1077
+ )
1078
+ else:
1079
+ instances = self.parse_description_with_bboxes_from_text_and_spans(
1080
+ text,
1081
+ pattern=pattern,
1082
+ image_size=image_size,
1083
+ )
1084
+ parsed_dict['description_with_bboxes_or_polygons'] = instances
1085
+ else:
1086
+ raise ValueError("task {} is not supported".format(task))
1087
+
1088
+ return parsed_dict
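Taken together, the processor above covers prompting, tokenization, image preprocessing, and task-specific parsing. A minimal end-to-end sketch, assuming `transformers` with remote-code loading and eliding the actual ONNX/PyTorch inference step (the image URL is a placeholder):

from PIL import Image
import requests
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("TrumpMcDonaldz/Florence-2-large-onnx", trust_remote_code=True)

image = Image.open(requests.get("https://example.com/cat.jpg", stream=True).raw)  # placeholder URL
inputs = processor(text="<OD>", images=image, return_tensors="pt")

# ... run the model on `inputs` to obtain `generated_ids` ...
# text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
# parsed = processor.post_process_generation(text, task="<OD>", image_size=(image.width, image.height))
# parsed -> {'<OD>': {'bboxes': [...], 'labels': [...]}}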
processor_config.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "processor_class": "Florence2Processor"
6
+ }
special_tokens_map.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vocab.json ADDED
The diff for this file is too large to render. See raw diff