MiyamizuMitsuha committed on
Commit
ce57d08
1 Parent(s): 479d45f

Update app

Browse files
Files changed (2) hide show
  1. app.py +87 -34
  2. requirements.txt +6 -1
app.py CHANGED
@@ -99,8 +99,6 @@ def safe_cuda(self, *args, **kwargs):
99
  torch.Tensor.cuda = safe_cuda
100
 
101
 
102
-
103
-
104
  model_name = "YuukiAsuna/Vintern-1B-v2-ViTable-docvqa"
105
 
106
 
@@ -116,42 +114,97 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, us
116
 
117
 
118
 
119
-
120
def Vintern_1B_v2_ViTable_docvqa(image, question, chat_history=None):
    """Run one document-VQA turn and append it to a Gradio chat history.

    Parameters
    ----------
    image : str
        Filepath of the uploaded image (Gradio ``type="filepath"``).
    question : str
        The user's question about the image.
    chat_history : list | None
        Existing Gradio chatbot history; a fresh list is created when omitted.

    Returns
    -------
    list
        The updated chat history in Gradio chatbot (message, response) format.
    """
    # Avoid the shared-mutable-default pitfall: `chat_history=[]` would be
    # one list reused across every call, leaking turns between sessions.
    if chat_history is None:
        chat_history = []

    # Preprocess the image into model-ready tiles (max 12) on the GPU.
    pixel_values = load_image(image, max_num=12).to(torch.bfloat16).cuda()

    # Deterministic decoding: beam search, no sampling.
    generation_config = dict(max_new_tokens=1024, do_sample=False, num_beams=3, repetition_penalty=2.0)

    # The model expects the image placeholder token ahead of the question.
    question = '<image>\n' + question
    response = model.chat(tokenizer, pixel_values, question, generation_config)

    print(f'User: {question}\nAssistant: {response}')
    print("=" * 30)

    # Update the chat history in Gradio chatbot tuple format.
    chat_history.append((image, None))
    chat_history.append((question, None))
    chat_history.append((None, response))

    return chat_history
138
-
139
-
140
 
141
# Gradio UI wiring: image + free-text question in, chatbot-style transcript out.
interface = gr.Interface(
    fn=Vintern_1B_v2_ViTable_docvqa,
    inputs=[
        gr.Image(label="Upload Image", type="filepath"),  # Image input (passed as a filepath string)
        gr.Textbox(label="Enter your question"),  # Text input
    ],
    outputs=gr.Chatbot(label="Chat History"),  # Chatbot-style output
    title="Vintern-1B-v2-ViTable-docvqa,",
    # description="A chatbot that accepts both images and text, displays images, and provides conversational responses.",
    allow_flagging="never",
)

# Launch the chatbot
interface.launch()
156
 
157
 
 
99
  torch.Tensor.cuda = safe_cuda
100
 
101
 
 
 
102
  model_name = "YuukiAsuna/Vintern-1B-v2-ViTable-docvqa"
103
 
104
 
 
114
 
115
 
116
 
117
@spaces.GPU
def chat(message, history):
    """Gradio ChatInterface callback for Vintern-1B-v2-ViTable-docvqa.

    Parameters
    ----------
    message : dict
        Multimodal message with keys ``"text"`` (str) and ``"files"``
        (list of uploads; each entry exposes a ``"path"``).
    history : list
        Gradio chat history; when no new file is attached, the image is
        reused from the first turn (``history[0][0][0]``).

    Returns
    -------
    str
        The model's answer for this turn.
    """
    print(history)
    print(message)

    # Pick the image for this turn: a freshly uploaded file wins;
    # otherwise fall back to the image from the first turn.
    if len(message["files"]) != 0:
        test_image = message["files"][0]["path"]
    elif len(history) != 0:
        test_image = history[0][0][0]
    else:
        # First turn with no attachment: the original code indexed
        # message["files"][0] here and crashed with an IndexError.
        # Ask for an image instead.
        return "Please upload an image along with your question."

    pixel_values = load_image(test_image, max_num=12).to(torch.bfloat16).cuda()
    generation_config = dict(max_new_tokens=1024, do_sample=True, num_beams=3, repetition_penalty=2.5)

    if len(history) == 0:
        # First turn: prepend the image placeholder expected by the model.
        question = '<image>\n' + message["text"]
        response, conv_history = model.chat(
            tokenizer, pixel_values, question, generation_config,
            history=None, return_history=True,
        )
    else:
        # Rebuild the model-side history from completed (user, bot) pairs.
        # When the image comes from history (no new upload), tag the first
        # user turn with the <image> placeholder instead of this question.
        conv_history = []
        for chat_pair in history:
            if chat_pair[1] is not None:
                if len(conv_history) == 0 and len(message["files"]) == 0:
                    chat_pair[0] = '<image>\n' + chat_pair[0]
                conv_history.append(tuple(chat_pair))
        print(conv_history)
        if len(message["files"]) != 0:
            question = '<image>\n' + message["text"]
        else:
            question = message["text"]
        response, conv_history = model.chat(
            tokenizer, pixel_values, question, generation_config,
            history=conv_history, return_history=True,
        )

    print(f'User: {question}\nAssistant: {response}')

    return response
151
+
152
# Custom stylesheet injected into the Gradio app. NOTE: the string content
# (including the Vietnamese /* ... */ comments and the '#'-prefixed
# disabled media-query block) is served to the browser verbatim, so it is
# left byte-for-byte untouched here.
CSS ="""
# @media only screen and (max-width: 600px){
#     #component-3 {
#         height: 90dvh !important;
#         transform-origin: top; /* Đảm bảo rằng phần tử mở rộng từ trên xuống */
#         border-style: solid;
#         overflow: hidden;
#         flex-grow: 1;
#         min-width: min(160px, 100%);
#         border-width: var(--block-border-width);
#     }
# }
#component-3 {
    height: 50dvh !important;
    transform-origin: top; /* Đảm bảo rằng phần tử mở rộng từ trên xuống */
    border-style: solid;
    overflow: hidden;
    flex-grow: 1;
    min-width: min(160px, 100%);
    border-width: var(--block-border-width);
}
/* Đảm bảo ảnh bên trong nút hiển thị đúng cách cho các nút có aria-label chỉ định */
button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] img.svelte-1pijsyv {
    width: 100%;
    object-fit: contain;
    height: 100%;
    border-radius: 13px; /* Thêm bo góc cho ảnh */
    max-width: 50vw; /* Giới hạn chiều rộng ảnh */
}
/* Đặt chiều cao cho nút và cho phép chọn văn bản chỉ cho các nút có aria-label chỉ định */
button.svelte-1lcyrx4[aria-label="user's message: a file of type image/jpeg, "] {
    user-select: text;
    text-align: left;
    height: 300px;
}
/* Thêm bo góc và giới hạn chiều rộng cho ảnh không thuộc avatar container */
.message-wrap.svelte-1lcyrx4 > div.svelte-1lcyrx4 .svelte-1lcyrx4:not(.avatar-container) img {
    border-radius: 13px;
    max-width: 50vw;
}
.message-wrap.svelte-1lcyrx4 .message.svelte-1lcyrx4 img {
    margin: var(--size-2);
    max-height: 500px;
}
"""


# Multimodal chat UI wired to the chat() callback defined above.
demo = gr.ChatInterface(
    fn=chat,
    description="""Try [Vintern-1B-v2-ViTable-docvqa](https://huggingface.co/YuukiAsuna/Vintern-1B-v2-ViTable-docvqa) in this demo. Vintern-1B-v2-ViTable-docvqa is a finetuned version of [Vintern-1B-v2](https://huggingface.co/5CD-AI/Vintern-1B-v2)""",
    title="Vintern-1B-v2-ViTable-docvqa",
    multimodal=True,
    css=CSS
)
# queue() enables request queuing, needed for the GPU-decorated callback.
demo.queue().launch()
207
 
208
 
 
 
209
 
210
 
requirements.txt CHANGED
@@ -12,4 +12,9 @@ accelerate
12
  bitsandbytes
13
  peft
14
  tensorboardX
15
- flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
 
 
 
 
 
 
12
  bitsandbytes
13
  peft
14
  tensorboardX
15
+ flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
16
+ spaces
17
+ pypandoc
18
+ fastapi
19
+ wheel
20
+ imageio