Spaces:

daranaka
/

manga-narrator-ai

Runtime error

App Files Files Community

daranaka committed on Oct 20, 2024

Commit

42eb874

verified ·

1 Parent(s): 7545c8a

Update app.py

Browse files

Files changed (1) hide show

app.py +99 -76

app.py CHANGED Viewed

@@ -1,86 +1,109 @@
-import os
 import streamlit as st
-from PIL import Image
-import pytesseract
 from ultralytics import YOLO
-from transformers import pipeline
 import numpy as np
-from torchvision import transforms
-# Initialize models
-yolo_model = YOLO('yolov5n')  # YOLO model for panel and character detection
-summarizer = pipeline('summarization', model="facebook/bart-large-cnn")  # Text summarizer model
-# Hyperparameters
-st.sidebar.title("Adjust Hyperparameters")
-detection_threshold = st.sidebar.slider("Detection Confidence Threshold", 0.1, 1.0, 0.4)
-text_summary_length = st.sidebar.slider("Text Summary Length (Words)", 30, 150, 50)
-# Upload section
-st.title("Manga Narration AI")
-uploaded_files = st.file_uploader("Upload up to 60 Manga Images", accept_multiple_files=True, type=["jpg", "jpeg", "png"], key="images")
-# Ensure there are uploaded files
-if uploaded_files:
-    st.write(f"Processing {len(uploaded_files)} images...")
-    progress = st.progress(0)
-    narration_script = ""
-    num_images = len(uploaded_files)
-    for i, uploaded_file in enumerate(uploaded_files):
         # Update progress bar
-        progress.progress((i + 1) / num_images)
-        # Open image and display
-        image = Image.open(uploaded_file)
-        st.image(image, caption=f"Processing {uploaded_file.name}", use_column_width=True)
-        # Convert image to numpy array for YOLO
-        img_np = np.array(image)
-        # Panel and character detection using YOLO
-        results = yolo_model(img_np)
-        panels = []
-        characters = []
-        for res in results:
-            for detection in res.boxes.xyxy:
-                # Filter detections based on confidence
-                if detection.conf >= detection_threshold:
-                    x1, y1, x2, y2 = map(int, detection.xyxy)
-                    crop = image.crop((x1, y1, x2, y2))
-                    label = res.names[int(detection.cls)]
-                    if label == "person":
-                        characters.append(crop)
-                    else:
-                        panels.append(crop)
-        # Display detected characters and panels
-        st.write(f"Detected {len(panels)} panels and {len(characters)} characters in {uploaded_file.name}.")
-        for panel in panels:
-            st.image(panel, caption="Detected Panel", use_column_width=True)
-        for character in characters:
-            st.image(character, caption="Detected Character", use_column_width=True)
-        # Text extraction using OCR (Tesseract)
-        panel_text = ""
-        for panel in panels:
-            panel_text += pytesseract.image_to_string(panel) + " "
-        if panel_text:
-            # Summarize extracted text for clear narration
-            summary = summarizer(panel_text, max_length=text_summary_length, min_length=int(text_summary_length / 2), do_sample=False)[0]['summary_text']
-            narration_script += f"{summary}\n"
-            st.write(f"Summary: {summary}")
-        else:
-            st.write(f"No text detected in panels of {uploaded_file.name}.")
-    # Final narration script
-    st.success("Narration generation completed.")
-    st.write("Generated Narration Script:")
-    st.text(narration_script)
-# Add download option for generated narration
-if narration_script:
-    st.download_button("Download Narration", narration_script, "narration.txt")

 import streamlit as st
+import torch
 from ultralytics import YOLO
+import pytesseract
+from PIL import Image
 import numpy as np
+from transformers import pipeline
+import os
+import time
+# Point pytesseract at the Tesseract executable. The path is hard-coded to
+# /usr/bin/tesseract (optional, depending on your setup).
+pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"
+# Load the YOLOv8 model used by detect_panels_and_characters().
+# NOTE(review): Streamlit presumably re-executes this script on every widget
+# interaction, so both model loads below would repeat each time -- consider
+# caching them once the deployed Streamlit version is confirmed.
+yolo_model = YOLO('yolov8n.pt')  # YOLOv8 nano model for lightweight processing
+# Load the Hugging Face summarizer used by generate_narration().
+# NOTE(review): no model name is passed, so the library picks its default
+# summarization checkpoint -- TODO pin an explicit model for reproducibility.
+summarizer = pipeline("summarization")
+# App title shown in the main page area
+st.title("Manga Narration for the Visually Impaired")
+# Sidebar uploader; accepts multiple png/jpg/jpeg files.
+# NOTE(review): the label says "up to 60" but nothing visible here enforces
+# that limit.
+st.sidebar.title("Upload Manga Images")
+uploaded_files = st.sidebar.file_uploader("Select up to 60 manga images", type=["png", "jpg", "jpeg"], accept_multiple_files=True)
+# Sidebar progress bar, starting at 0; process_images() advances it per image
+progress_bar = st.sidebar.progress(0)
+# Hyperparameters for tuning:
+#   confidence_threshold / iou_threshold -> passed to yolo_model.predict()
+#   summarization_length                 -> passed as max_length to summarizer
+# NOTE(review): the slider is labelled in words, but the summarizer's
+# max_length is presumably measured in tokens -- confirm and relabel if so.
+st.sidebar.title("Hyperparameters")
+confidence_threshold = st.sidebar.slider("YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25)
+iou_threshold = st.sidebar.slider("YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45)
+summarization_length = st.sidebar.slider("Summary Length (words)", min_value=50, max_value=300, value=100)
+def detect_panels_and_characters(image):
+    # Perform panel and character detection using YOLOv8
+    results = yolo_model.predict(image, conf=confidence_threshold, iou=iou_threshold)
+    # Extract bounding boxes and labels
+    panels = []
+    characters = []
+    for result in results[0].boxes:
+        if result.cls == 0:  # Assuming '0' is the class ID for panels
+            panels.append(result.xyxy.cpu().numpy())  # Panel bounding box
+        elif result.cls == 1:  # Assuming '1' is the class ID for characters
+            characters.append(result.xyxy.cpu().numpy())  # Character bounding box
+    return panels, characters
+def detect_text(image):
+    # Convert image to grayscale for better OCR accuracy
+    gray_image = Image.fromarray(image).convert("L")
+    text = pytesseract.image_to_string(gray_image)
+    return text
+def generate_narration(panels, characters, text):
+    # Match detected text to characters in the panels
+    narration = ""
+    if panels:
+        narration += f"Detected {len(panels)} panels. "
+    if characters:
+        narration += f"{len(characters)} characters were found in the scene. "
+    # Add the summarization of the detected text as narration
+    if text.strip():
+        narration += "Here's a summary of the text: "
+        summary = summarizer(text, max_length=summarization_length, min_length=30, do_sample=False)[0]['summary_text']
+        narration += summary
+    return narration
+def process_images(uploaded_files):
+    narrations = []
+    total_images = len(uploaded_files)
+    for idx, file in enumerate(uploaded_files):
+        # Load the image
+        image = Image.open(file)
+        image_np = np.array(image)
+        # Detect panels and characters
+        panels, characters = detect_panels_and_characters(image_np)
+        # Detect text
+        text = detect_text(image_np)
+        # Generate narration
+        narration = generate_narration(panels, characters, text)
+        narrations.append(narration)
         # Update progress bar
+        progress_bar.progress((idx + 1) / total_images)
+        # Display the current image and its narration
+        st.image(image, caption=f"Image {idx + 1}")
+        st.write(narration)
+    return narrations
+if uploaded_files:
+    # Process uploaded images
+    narrations = process_images(uploaded_files)
+    # Show final results after processing all images
+    st.write("Narration Summary for All Images:")
+    st.write("\n\n".join(narrations))
+else:
+    st.write("Please upload manga images to get started.")