Mohammed422 committed on
Commit
714bf44
·
1 Parent(s): 81a90a1

small changes

Browse files
Files changed (1) hide show
  1. app.py +343 -39
app.py CHANGED
@@ -1,3 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import spaces
3
  from gradio.themes.base import Base
@@ -9,6 +256,7 @@ import os
9
  import json
10
  import fitz # PyMuPDF
11
 
 
12
  # Define a custom theme inheriting from the soft theme
13
  class CustomTheme(Base):
14
  def __init__(self):
@@ -16,10 +264,12 @@ class CustomTheme(Base):
16
  self.primary_hue = "blue"
17
  self.secondary_hue = "sky"
18
 
 
19
  custom_theme = CustomTheme()
20
 
21
  DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
22
 
 
23
  def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
24
  if image_filepath is None:
25
  raise ValueError("No image provided.")
@@ -31,6 +281,7 @@ def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
31
 
32
  return os.path.abspath(image_filepath), img.width, img.height
33
 
 
34
  def convert_pdf_to_images(pdf_path):
35
  """Opens a PDF and converts each page into a high-resolution PNG image."""
36
  image_paths = []
@@ -40,21 +291,21 @@ def convert_pdf_to_images(pdf_path):
40
  for i, page in enumerate(doc):
41
  pix = page.get_pixmap(dpi=200)
42
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43
- image_path = f"{base_name}_page_{i+1}_{timestamp}.png"
44
  pix.save(image_path)
45
  image_paths.append(image_path)
46
 
47
  doc.close()
48
  return image_paths
49
 
 
50
  # Initialize the model and processor
51
  model = Qwen2VLForConditionalGeneration.from_pretrained(
52
- "Qwen/Qwen2-VL-7B-Instruct",
53
- torch_dtype="auto",
54
- device_map="auto"
55
  )
56
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
57
 
 
58
  @spaces.GPU
59
  def run_inference(uploaded_files, text_input):
60
  results = []
@@ -67,7 +318,9 @@ def run_inference(uploaded_files, text_input):
67
  )
68
 
69
  if not uploaded_files:
70
- error_json = json.dumps({"error": "No file provided. Please upload an image or PDF."}, indent=4)
 
 
71
  return error_json, gr.Button(interactive=False)
72
 
73
  image_paths_to_process = []
@@ -76,56 +329,92 @@ def run_inference(uploaded_files, text_input):
76
  file_path = file_obj.name
77
  temp_files_to_clean.append(file_path)
78
 
79
- if file_path.lower().endswith('.pdf'):
80
  pdf_page_images = convert_pdf_to_images(file_path)
81
  image_paths_to_process.extend(pdf_page_images)
82
  temp_files_to_clean.extend(pdf_page_images)
83
- elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp')):
 
 
84
  image_paths_to_process.append(file_path)
85
  else:
86
  unsupported_files.append(os.path.basename(file_path))
87
 
88
  if unsupported_files:
89
  unsupported_str = ", ".join(unsupported_files)
90
- results.append(json.dumps({
91
- "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
92
- "details": "Please upload only images (PNG, JPG, etc.) or PDF files."
93
- }, indent=4))
 
 
 
 
 
94
 
95
  for image_file in image_paths_to_process:
96
  try:
97
  image_path, width, height = array_to_image_path(image_file)
98
 
99
  messages = [
100
- {"role": "user", "content": [
101
- {"type": "image", "image": image_path, "resized_height": height, "resized_width": width},
102
- {"type": "text", "text": json_prompt}
103
- ]}
 
 
 
 
 
 
 
 
104
  ]
105
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
106
  image_inputs, video_inputs = process_vision_info(messages)
107
- inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to("cuda")
 
 
 
 
 
 
108
 
109
  generated_ids = model.generate(**inputs, max_new_tokens=4096)
110
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
111
- raw_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
 
 
 
 
 
 
112
  raw_text = raw_output[0]
113
 
114
  try:
115
- start_index = raw_text.find('{')
116
- end_index = raw_text.rfind('}') + 1
117
  if start_index != -1 and end_index != 0:
118
  json_string = raw_text[start_index:end_index]
119
  parsed_json = json.loads(json_string)
120
- parsed_json['source_page'] = os.path.basename(image_path)
121
  formatted_json = json.dumps(parsed_json, indent=4)
122
  results.append(formatted_json)
123
  else:
124
- results.append(f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
 
 
125
  except json.JSONDecodeError:
126
- results.append(f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
 
 
127
  except Exception as e:
128
- results.append(f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}')
 
 
129
 
130
  for f in temp_files_to_clean:
131
  if os.path.exists(f):
@@ -153,20 +442,30 @@ def generate_explanation(json_text):
153
  )
154
 
155
  messages = [{"role": "user", "content": explanation_prompt}]
156
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
157
  inputs = processor(text=[text], return_tensors="pt").to("cuda")
158
 
159
  generated_ids = model.generate(**inputs, max_new_tokens=2048)
160
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
161
- explanation_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
 
 
 
 
 
 
 
162
 
163
  return explanation_output
164
 
165
- # --- FINAL AND MOST ROBUST CSS FIX ---
 
166
  css = """
167
  .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
168
 
169
- /* --- Light Mode Styles --- */
170
  #output-code, #output-code pre, #output-code code {
171
  background-color: #f0f0f0;
172
  border: 1px solid #e0e0e0;
@@ -185,7 +484,7 @@ css = """
185
  border-radius: 7px;
186
  }
187
 
188
- /* --- Dark Mode Overrides targeting Gradio's .dark class --- */
189
  .dark #output-code, .dark #output-code pre, .dark #output-code code {
190
  background-color: #2b2b2b !important;
191
  border: 1px solid #444 !important;
@@ -194,11 +493,9 @@ css = """
194
  .dark #explanation-box {
195
  border: 1px solid #444 !important;
196
  }
197
- /* This is a catch-all to ensure all parts of the syntax start light-colored */
198
  .dark #output-code code span {
199
  color: #f0f0f0 !important;
200
  }
201
- /* Then, we apply specific colors for syntax highlighting on top */
202
  .dark #output-code .token.punctuation { color: #ccc !important; }
203
  .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
204
  .dark #output-code .token.number { color: #add8e6 !important; }
@@ -214,7 +511,7 @@ with gr.Blocks(theme=custom_theme, css=css) as demo:
214
  input_files = gr.Files(label="Upload Images or PDFs")
215
  text_input = gr.Textbox(
216
  label="Your Query",
217
- placeholder="e.g., Extract the total amount from this receipt."
218
  )
219
  submit_btn = gr.Button("Analyze File(s)", variant="primary")
220
 
@@ -223,22 +520,29 @@ with gr.Blocks(theme=custom_theme, css=css) as demo:
223
  label="Full JSON Response",
224
  language="json",
225
  elem_id="output-code",
226
- interactive=False # This makes the output field read-only
 
 
 
 
 
 
227
  )
228
- explanation_btn = gr.Button("📄 Generate Detailed Explanation", interactive=False)
229
- explanation_output = gr.Markdown(label="Detailed Explanation", elem_id="explanation-box")
230
 
 
231
  submit_btn.click(
232
  fn=run_inference,
233
  inputs=[input_files, text_input],
234
- outputs=[output_text, explanation_btn]
 
235
  )
236
 
237
  explanation_btn.click(
238
  fn=generate_explanation,
239
  inputs=[output_text],
240
  outputs=[explanation_output],
241
- show_progress='full'
 
242
  )
243
 
244
  demo.queue()
 
1
+ # import gradio as gr
2
+ # import spaces
3
+ # from gradio.themes.base import Base
4
+ # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
5
+ # from qwen_vl_utils import process_vision_info
6
+ # from PIL import Image
7
+ # from datetime import datetime
8
+ # import os
9
+ # import json
10
+ # import fitz # PyMuPDF
11
+
12
+ # # Define a custom theme inheriting from the soft theme
13
+ # class CustomTheme(Base):
14
+ # def __init__(self):
15
+ # super().__init__()
16
+ # self.primary_hue = "blue"
17
+ # self.secondary_hue = "sky"
18
+
19
+ # custom_theme = CustomTheme()
20
+
21
+ # DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
22
+
23
+ # def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
24
+ # if image_filepath is None:
25
+ # raise ValueError("No image provided.")
26
+
27
+ # img = Image.open(image_filepath)
28
+ # width, height = img.size
29
+ # if width > max_width or height > max_height:
30
+ # img.thumbnail((max_width, max_height))
31
+
32
+ # return os.path.abspath(image_filepath), img.width, img.height
33
+
34
+ # def convert_pdf_to_images(pdf_path):
35
+ # """Opens a PDF and converts each page into a high-resolution PNG image."""
36
+ # image_paths = []
37
+ # doc = fitz.open(pdf_path)
38
+ # base_name = os.path.splitext(os.path.basename(pdf_path))[0]
39
+
40
+ # for i, page in enumerate(doc):
41
+ # pix = page.get_pixmap(dpi=200)
42
+ # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43
+ # image_path = f"{base_name}_page_{i+1}_{timestamp}.png"
44
+ # pix.save(image_path)
45
+ # image_paths.append(image_path)
46
+
47
+ # doc.close()
48
+ # return image_paths
49
+
50
+ # # Initialize the model and processor
51
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
52
+ # "Qwen/Qwen2-VL-7B-Instruct",
53
+ # torch_dtype="auto",
54
+ # device_map="auto"
55
+ # )
56
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
57
+
58
+ # @spaces.GPU
59
+ # def run_inference(uploaded_files, text_input):
60
+ # results = []
61
+ # temp_files_to_clean = []
62
+
63
+ # json_prompt = (
64
+ # f"{text_input}\n\nBased on the image and the query, respond ONLY with a single, "
65
+ # "valid JSON object. This object should be well-structured, using nested objects "
66
+ # "and arrays to logically represent the information."
67
+ # )
68
+
69
+ # if not uploaded_files:
70
+ # error_json = json.dumps({"error": "No file provided. Please upload an image or PDF."}, indent=4)
71
+ # return error_json, gr.Button(interactive=False)
72
+
73
+ # image_paths_to_process = []
74
+ # unsupported_files = []
75
+ # for file_obj in uploaded_files:
76
+ # file_path = file_obj.name
77
+ # temp_files_to_clean.append(file_path)
78
+
79
+ # if file_path.lower().endswith('.pdf'):
80
+ # pdf_page_images = convert_pdf_to_images(file_path)
81
+ # image_paths_to_process.extend(pdf_page_images)
82
+ # temp_files_to_clean.extend(pdf_page_images)
83
+ # elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp')):
84
+ # image_paths_to_process.append(file_path)
85
+ # else:
86
+ # unsupported_files.append(os.path.basename(file_path))
87
+
88
+ # if unsupported_files:
89
+ # unsupported_str = ", ".join(unsupported_files)
90
+ # results.append(json.dumps({
91
+ # "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
92
+ # "details": "Please upload only images (PNG, JPG, etc.) or PDF files."
93
+ # }, indent=4))
94
+
95
+ # for image_file in image_paths_to_process:
96
+ # try:
97
+ # image_path, width, height = array_to_image_path(image_file)
98
+
99
+ # messages = [
100
+ # {"role": "user", "content": [
101
+ # {"type": "image", "image": image_path, "resized_height": height, "resized_width": width},
102
+ # {"type": "text", "text": json_prompt}
103
+ # ]}
104
+ # ]
105
+ # text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
106
+ # image_inputs, video_inputs = process_vision_info(messages)
107
+ # inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to("cuda")
108
+
109
+ # generated_ids = model.generate(**inputs, max_new_tokens=4096)
110
+ # generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
111
+ # raw_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)
112
+ # raw_text = raw_output[0]
113
+
114
+ # try:
115
+ # start_index = raw_text.find('{')
116
+ # end_index = raw_text.rfind('}') + 1
117
+ # if start_index != -1 and end_index != 0:
118
+ # json_string = raw_text[start_index:end_index]
119
+ # parsed_json = json.loads(json_string)
120
+ # parsed_json['source_page'] = os.path.basename(image_path)
121
+ # formatted_json = json.dumps(parsed_json, indent=4)
122
+ # results.append(formatted_json)
123
+ # else:
124
+ # results.append(f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
125
+ # except json.JSONDecodeError:
126
+ # results.append(f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
127
+ # except Exception as e:
128
+ # results.append(f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}')
129
+
130
+ # for f in temp_files_to_clean:
131
+ # if os.path.exists(f):
132
+ # try:
133
+ # os.remove(f)
134
+ # except OSError as e:
135
+ # print(f"Error deleting file {f}: {e}")
136
+
137
+ # final_json = "\n---\n".join(results)
138
+ # is_error = '"error":' in final_json
139
+ # return final_json, gr.Button(interactive=not is_error)
140
+
141
+
142
+ # @spaces.GPU
143
+ # def generate_explanation(json_text):
144
+ # if not json_text or '"error":' in json_text:
145
+ # return "Cannot generate an explanation. Please produce a valid JSON output first. 🙁"
146
+
147
+ # explanation_prompt = (
148
+ # "You are an expert data analyst. Your task is to provide a comprehensive, human-readable explanation "
149
+ # "of the following JSON data, which may represent one or more pages from a document. First, provide a textual explanation. "
150
+ # "If the JSON contains data from multiple sources (pages), explain each one. Then, if the JSON data represents a table, "
151
+ # "a list of items, or a receipt, you **must** re-format the key information into a Markdown table for clarity.\n\n"
152
+ # f"JSON Data:\n```json\n{json_text}\n```"
153
+ # )
154
+
155
+ # messages = [{"role": "user", "content": explanation_prompt}]
156
+ # text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
157
+ # inputs = processor(text=[text], return_tensors="pt").to("cuda")
158
+
159
+ # generated_ids = model.generate(**inputs, max_new_tokens=2048)
160
+ # generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
161
+ # explanation_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
162
+
163
+ # return explanation_output
164
+
165
+ # # --- FINAL AND MOST ROBUST CSS FIX ---
166
+ # css = """
167
+ # .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
168
+
169
+ # /* --- Light Mode Styles --- */
170
+ # #output-code, #output-code pre, #output-code code {
171
+ # background-color: #f0f0f0;
172
+ # border: 1px solid #e0e0e0;
173
+ # border-radius: 7px;
174
+ # color: #333;
175
+ # }
176
+ # #output-code .token.punctuation { color: #393a34; }
177
+ # #output-code .token.property, #output-code .token.string { color: #0b7500; }
178
+ # #output-code .token.number { color: #2973b7; }
179
+ # #output-code .token.boolean { color: #9a050f; }
180
+
181
+ # #explanation-box {
182
+ # min-height: 200px;
183
+ # border: 1px solid #e0e0e0;
184
+ # padding: 15px;
185
+ # border-radius: 7px;
186
+ # }
187
+
188
+ # /* --- Dark Mode Overrides targeting Gradio's .dark class --- */
189
+ # .dark #output-code, .dark #output-code pre, .dark #output-code code {
190
+ # background-color: #2b2b2b !important;
191
+ # border: 1px solid #444 !important;
192
+ # color: #f0f0f0 !important;
193
+ # }
194
+ # .dark #explanation-box {
195
+ # border: 1px solid #444 !important;
196
+ # }
197
+ # /* This is a catch-all to ensure all parts of the syntax start light-colored */
198
+ # .dark #output-code code span {
199
+ # color: #f0f0f0 !important;
200
+ # }
201
+ # /* Then, we apply specific colors for syntax highlighting on top */
202
+ # .dark #output-code .token.punctuation { color: #ccc !important; }
203
+ # .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
204
+ # .dark #output-code .token.number { color: #add8e6 !important; }
205
+ # .dark #output-code .token.boolean { color: #f08080 !important; }
206
+ # """
207
+
208
+ # with gr.Blocks(theme=custom_theme, css=css) as demo:
209
+ # gr.Markdown("# Sparrow Qwen2-VL-7B Vision AI 👁️")
210
+ # gr.Markdown(DESCRIPTION)
211
+
212
+ # with gr.Row():
213
+ # with gr.Column(scale=1):
214
+ # input_files = gr.Files(label="Upload Images or PDFs")
215
+ # text_input = gr.Textbox(
216
+ # label="Your Query",
217
+ # placeholder="e.g., Extract the total amount from this receipt."
218
+ # )
219
+ # submit_btn = gr.Button("Analyze File(s)", variant="primary")
220
+
221
+ # with gr.Column(scale=2):
222
+ # output_text = gr.Code(
223
+ # label="Full JSON Response",
224
+ # language="json",
225
+ # elem_id="output-code",
226
+ # interactive=False # This makes the output field read-only
227
+ # )
228
+ # explanation_btn = gr.Button("📄 Generate Detailed Explanation", interactive=False)
229
+ # explanation_output = gr.Markdown(label="Detailed Explanation", elem_id="explanation-box")
230
+
231
+ # submit_btn.click(
232
+ # fn=run_inference,
233
+ # inputs=[input_files, text_input],
234
+ # outputs=[output_text, explanation_btn]
235
+ # )
236
+
237
+ # explanation_btn.click(
238
+ # fn=generate_explanation,
239
+ # inputs=[output_text],
240
+ # outputs=[explanation_output],
241
+ # show_progress='full'
242
+ # )
243
+
244
+ # demo.queue()
245
+ # demo.launch(debug=True)
246
+
247
+
248
  import gradio as gr
249
  import spaces
250
  from gradio.themes.base import Base
 
256
  import json
257
  import fitz # PyMuPDF
258
 
259
+
260
  # Define a custom theme inheriting from the soft theme
261
  class CustomTheme(Base):
262
  def __init__(self):
 
264
  self.primary_hue = "blue"
265
  self.secondary_hue = "sky"
266
 
267
+
268
  custom_theme = CustomTheme()
269
 
270
  DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
271
 
272
+
273
  def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
274
  if image_filepath is None:
275
  raise ValueError("No image provided.")
 
281
 
282
  return os.path.abspath(image_filepath), img.width, img.height
283
 
284
+
285
  def convert_pdf_to_images(pdf_path):
286
  """Opens a PDF and converts each page into a high-resolution PNG image."""
287
  image_paths = []
 
291
  for i, page in enumerate(doc):
292
  pix = page.get_pixmap(dpi=200)
293
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
294
+ image_path = f"{base_name}_page_{i + 1}_{timestamp}.png"
295
  pix.save(image_path)
296
  image_paths.append(image_path)
297
 
298
  doc.close()
299
  return image_paths
300
 
301
+
302
  # Initialize the model and processor
303
  model = Qwen2VLForConditionalGeneration.from_pretrained(
304
+ "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
 
 
305
  )
306
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
307
 
308
+
309
  @spaces.GPU
310
  def run_inference(uploaded_files, text_input):
311
  results = []
 
318
  )
319
 
320
  if not uploaded_files:
321
+ error_json = json.dumps(
322
+ {"error": "No file provided. Please upload an image or PDF."}, indent=4
323
+ )
324
  return error_json, gr.Button(interactive=False)
325
 
326
  image_paths_to_process = []
 
329
  file_path = file_obj.name
330
  temp_files_to_clean.append(file_path)
331
 
332
+ if file_path.lower().endswith(".pdf"):
333
  pdf_page_images = convert_pdf_to_images(file_path)
334
  image_paths_to_process.extend(pdf_page_images)
335
  temp_files_to_clean.extend(pdf_page_images)
336
+ elif file_path.lower().endswith(
337
+ (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp")
338
+ ):
339
  image_paths_to_process.append(file_path)
340
  else:
341
  unsupported_files.append(os.path.basename(file_path))
342
 
343
  if unsupported_files:
344
  unsupported_str = ", ".join(unsupported_files)
345
+ results.append(
346
+ json.dumps(
347
+ {
348
+ "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
349
+ "details": "Please upload only images (PNG, JPG, etc.) or PDF files.",
350
+ },
351
+ indent=4,
352
+ )
353
+ )
354
 
355
  for image_file in image_paths_to_process:
356
  try:
357
  image_path, width, height = array_to_image_path(image_file)
358
 
359
  messages = [
360
+ {
361
+ "role": "user",
362
+ "content": [
363
+ {
364
+ "type": "image",
365
+ "image": image_path,
366
+ "resized_height": height,
367
+ "resized_width": width,
368
+ },
369
+ {"type": "text", "text": json_prompt},
370
+ ],
371
+ }
372
  ]
373
+ text = processor.apply_chat_template(
374
+ messages, tokenize=False, add_generation_prompt=True
375
+ )
376
  image_inputs, video_inputs = process_vision_info(messages)
377
+ inputs = processor(
378
+ text=[text],
379
+ images=image_inputs,
380
+ videos=video_inputs,
381
+ padding=True,
382
+ return_tensors="pt",
383
+ ).to("cuda")
384
 
385
  generated_ids = model.generate(**inputs, max_new_tokens=4096)
386
+ generated_ids_trimmed = [
387
+ out_ids[len(in_ids) :]
388
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
389
+ ]
390
+ raw_output = processor.batch_decode(
391
+ generated_ids_trimmed,
392
+ skip_special_tokens=True,
393
+ clean_up_tokenization_spaces=True,
394
+ )
395
  raw_text = raw_output[0]
396
 
397
  try:
398
+ start_index = raw_text.find("{")
399
+ end_index = raw_text.rfind("}") + 1
400
  if start_index != -1 and end_index != 0:
401
  json_string = raw_text[start_index:end_index]
402
  parsed_json = json.loads(json_string)
403
+ parsed_json["source_page"] = os.path.basename(image_path)
404
  formatted_json = json.dumps(parsed_json, indent=4)
405
  results.append(formatted_json)
406
  else:
407
+ results.append(
408
+ f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}'
409
+ )
410
  except json.JSONDecodeError:
411
+ results.append(
412
+ f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}'
413
+ )
414
  except Exception as e:
415
+ results.append(
416
+ f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}'
417
+ )
418
 
419
  for f in temp_files_to_clean:
420
  if os.path.exists(f):
 
442
  )
443
 
444
  messages = [{"role": "user", "content": explanation_prompt}]
445
+ text = processor.apply_chat_template(
446
+ messages, tokenize=False, add_generation_prompt=True
447
+ )
448
  inputs = processor(text=[text], return_tensors="pt").to("cuda")
449
 
450
  generated_ids = model.generate(**inputs, max_new_tokens=2048)
451
+ generated_ids_trimmed = [
452
+ out_ids[len(in_ids) :]
453
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
454
+ ]
455
+ explanation_output = processor.batch_decode(
456
+ generated_ids_trimmed,
457
+ skip_special_tokens=True,
458
+ clean_up_tokenization_spaces=True,
459
+ )[0]
460
 
461
  return explanation_output
462
 
463
+
464
+ # Define the Gradio UI
465
  css = """
466
  .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
467
 
468
+ /* Default (Light Mode) Styles */
469
  #output-code, #output-code pre, #output-code code {
470
  background-color: #f0f0f0;
471
  border: 1px solid #e0e0e0;
 
484
  border-radius: 7px;
485
  }
486
 
487
+ /* Dark Mode Overrides targeting Gradio's .dark class */
488
  .dark #output-code, .dark #output-code pre, .dark #output-code code {
489
  background-color: #2b2b2b !important;
490
  border: 1px solid #444 !important;
 
493
  .dark #explanation-box {
494
  border: 1px solid #444 !important;
495
  }
 
496
  .dark #output-code code span {
497
  color: #f0f0f0 !important;
498
  }
 
499
  .dark #output-code .token.punctuation { color: #ccc !important; }
500
  .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
501
  .dark #output-code .token.number { color: #add8e6 !important; }
 
511
  input_files = gr.Files(label="Upload Images or PDFs")
512
  text_input = gr.Textbox(
513
  label="Your Query",
514
+ placeholder="e.g., Extract the total amount from this receipt.",
515
  )
516
  submit_btn = gr.Button("Analyze File(s)", variant="primary")
517
 
 
520
  label="Full JSON Response",
521
  language="json",
522
  elem_id="output-code",
523
+ interactive=False,
524
+ )
525
+ explanation_btn = gr.Button(
526
+ "📄 Generate Detailed Explanation", interactive=False
527
+ )
528
+ explanation_output = gr.Markdown(
529
+ label="Detailed Explanation", elem_id="explanation-box"
530
  )
 
 
531
 
532
+ # Add api_name to create stable API endpoints
533
  submit_btn.click(
534
  fn=run_inference,
535
  inputs=[input_files, text_input],
536
+ outputs=[output_text, explanation_btn],
537
+ api_name="analyze_document",
538
  )
539
 
540
  explanation_btn.click(
541
  fn=generate_explanation,
542
  inputs=[output_text],
543
  outputs=[explanation_output],
544
+ show_progress="full",
545
+ api_name="generate_explanation",
546
  )
547
 
548
  demo.queue()