{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "view-in-github" }, "source": [ "\"Open" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x6dFfL0QUr8P", "outputId": "58f3b497-f4e8-46bc-a40c-b564f6e14010" }, "outputs": [], "source": [ "#@title Check out source repo if not automatically available\n", "# !git clone https://github.com/GuardianUI/ui-refexp\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "RQdzURjDWYco", "outputId": "2628c536-780e-4544-8f37-33a7e79ee367" }, "outputs": [], "source": [ "# Go to hf space dir if not already there\n", "# !cd ui-refexp/hf-space && \n", "\n", "!pip3 install -r requirements.txt" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# from PIL import Image, ImageDraw\n", "# from transformers import DonutProcessor, VisionEncoderDecoderModel\n", "\n", "# pretrained_repo_name = 'ivelin/donut-refexp-click'\n", "# pretrained_revision = 'main'\n", "# # revision can be git commit hash, branch or tag\n", "# # use 'main' for latest revision\n", "# print(f\"Loading model checkpoint: {pretrained_repo_name}\")\n", "\n", "# proc = DonutProcessor.from_pretrained(\n", "# pretrained_repo_name, revision=pretrained_revision, use_auth_token=True) # True = reuse the token saved by huggingface-cli login; never commit a literal access token\n", "# proc.image_processor.do_align_long_axis = False\n", "# proc.image_processor.do_resize = False\n", "# proc.image_processor.do_thumbnail = False\n", "# proc.image_processor.do_pad = False\n", "# proc.image_processor.do_rescale = False\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/gitpod/.pyenv/versions/3.8.16/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Loading model checkpoint: ivelin/donut-refexp-click\n", "processor image size: {'height': 1280, 'width': 960}\n", "Running on local URL:\n", "Running on public URL: https://f2beb057-2b06-4a52.gradio.live\n", "\n", "This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "(image, prompt): , click on search button\n", "predicted decoder sequence: <s_refexp><s_prompt> click on search button</s_prompt><s_target_center><s_x> 0.23</s_x><s_y> 0.33</s_y></s_target_center></s>\n", "predicted decoder sequence before token2json: <s_prompt> click on search button</s_prompt><s_target_center><s_x> 0.23</s_x><s_y> 0.33</s_y></s_target_center>\n", "predicted center_point with text coordinates: {'x': '0.23', 'y': '0.33'}\n", "predicted center_point with float coordinates: {'x': 0.23, 'y': 0.33, 'decoder output sequence (before x,y adjustment)': ' click on search button 0.23 0.33'}\n", "input image size: (2719, 980)\n", "processed prompt: click on search button\n", "point={'x': 0.23, 'y': 0.33, 'decoder output sequence (before x,y adjustment)': ' click on search button 0.23 0.33'}, input_image_size=(2719, 980), output_image_size=(960, 1280)\n", ">>> resized_width=960\n", ">>> resized_height=346\n", "translated point={'x': 0.23, 'y': 1.2208092485549134, 'decoder output sequence (before x,y adjustment)': ' click on search button 0.23 0.33'}, resized_image_size: (960, 346)\n", "to image pixel values: x, y: (625, 1196)\n", "(image, prompt): , click on search names\n", "predicted decoder sequence: <s_refexp><s_prompt> click on search names</s_prompt><s_target_center><s_x> 0.5</s_x><s_y> 0.18</s_y></s_target_center></s>\n", "predicted decoder sequence before token2json: <s_prompt> click on search names</s_prompt><s_target_center><s_x> 0.5</s_x><s_y> 0.18</s_y></s_target_center>\n", "predicted center_point with text coordinates: {'x': '0.5', 'y': '0.18'}\n", "predicted center_point with float coordinates: {'x': 0.5, 'y': 0.18, 'decoder output sequence (before x,y adjustment)': ' click on search names 0.5 0.18'}\n", "input image size: (2719, 980)\n", "processed prompt: click on search names\n", "point={'x': 0.5, 'y': 0.18, 
'decoder output sequence (before x,y adjustment)': ' click on search names 0.5 0.18'}, input_image_size=(2719, 980), output_image_size=(960, 1280)\n", ">>> resized_width=960\n", ">>> resized_height=346\n", "translated point={'x': 0.5, 'y': 0.6658959537572254, 'decoder output sequence (before x,y adjustment)': ' click on search names 0.5 0.18'}, resized_image_size: (960, 346)\n", "to image pixel values: x, y: (1359, 652)\n" ] } ], "source": [ "import app\n", "\n", "# img = Image.open('val-image-4.jpg')\n", "# print(img.size)\n", "# display(img)\n", "# out_size = (proc.image_processor.size['width'],\n", "# proc.image_processor.size['height'])\n", "# oimg = app.prepare_image_for_encoder(img, output_image_size=out_size)\n", "# print(oimg.size)\n", "# display(oimg)\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# import transformers\n", "\n", "# turn off normalization so we can see the image\n", "# otherwise the pixels are tiny [0..1] float values that all look like the color black (0)\n", "# proc.image_processor.do_normalize = False\n", "\n", "# npimg = proc.image_processor.preprocess(oimg)\n", "# pimg = transformers.image_transforms.to_pil_image(npimg['pixel_values'][0])\n", "# pimg.save('tmp.png')\n", "# display(pimg)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "include_colab_link": true, "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.16" }, "vscode": { "interpreter": { "hash": 
"9ac03a0a6051494cc606d484d27d20fce22fb7b4d169f583271e11d5ba46a56e" } } }, "nbformat": 4, "nbformat_minor": 0 }