daranaka committed on
Commit
42eb874
·
verified ·
1 Parent(s): 7545c8a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -76
app.py CHANGED
@@ -1,86 +1,109 @@
1
- import os
2
  import streamlit as st
3
- from PIL import Image
4
- import pytesseract
5
  from ultralytics import YOLO
6
- from transformers import pipeline
 
7
  import numpy as np
8
- from torchvision import transforms
 
 
9
 
10
- # Initialize models
11
- yolo_model = YOLO('yolov5n') # YOLO model for panel and character detection
12
- summarizer = pipeline('summarization', model="facebook/bart-large-cnn") # Text summarizer model
13
 
14
- # Hyperparameters
15
- st.sidebar.title("Adjust Hyperparameters")
16
- detection_threshold = st.sidebar.slider("Detection Confidence Threshold", 0.1, 1.0, 0.4)
17
- text_summary_length = st.sidebar.slider("Text Summary Length (Words)", 30, 150, 50)
18
 
19
- # Upload section
20
- st.title("Manga Narration AI")
21
- uploaded_files = st.file_uploader("Upload up to 60 Manga Images", accept_multiple_files=True, type=["jpg", "jpeg", "png"], key="images")
22
 
23
- # Ensure there are uploaded files
24
- if uploaded_files:
25
- st.write(f"Processing {len(uploaded_files)} images...")
26
- progress = st.progress(0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- narration_script = ""
29
- num_images = len(uploaded_files)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- for i, uploaded_file in enumerate(uploaded_files):
32
  # Update progress bar
33
- progress.progress((i + 1) / num_images)
34
-
35
- # Open image and display
36
- image = Image.open(uploaded_file)
37
- st.image(image, caption=f"Processing {uploaded_file.name}", use_column_width=True)
38
-
39
- # Convert image to numpy array for YOLO
40
- img_np = np.array(image)
41
-
42
- # Panel and character detection using YOLO
43
- results = yolo_model(img_np)
44
- panels = []
45
- characters = []
46
-
47
- for res in results:
48
- for detection in res.boxes.xyxy:
49
- # Filter detections based on confidence
50
- if detection.conf >= detection_threshold:
51
- x1, y1, x2, y2 = map(int, detection.xyxy)
52
- crop = image.crop((x1, y1, x2, y2))
53
- label = res.names[int(detection.cls)]
54
- if label == "person":
55
- characters.append(crop)
56
- else:
57
- panels.append(crop)
58
-
59
- # Display detected characters and panels
60
- st.write(f"Detected {len(panels)} panels and {len(characters)} characters in {uploaded_file.name}.")
61
- for panel in panels:
62
- st.image(panel, caption="Detected Panel", use_column_width=True)
63
- for character in characters:
64
- st.image(character, caption="Detected Character", use_column_width=True)
65
-
66
- # Text extraction using OCR (Tesseract)
67
- panel_text = ""
68
- for panel in panels:
69
- panel_text += pytesseract.image_to_string(panel) + " "
70
-
71
- if panel_text:
72
- # Summarize extracted text for clear narration
73
- summary = summarizer(panel_text, max_length=text_summary_length, min_length=int(text_summary_length / 2), do_sample=False)[0]['summary_text']
74
- narration_script += f"{summary}\n"
75
- st.write(f"Summary: {summary}")
76
- else:
77
- st.write(f"No text detected in panels of {uploaded_file.name}.")
78
-
79
- # Final narration script
80
- st.success("Narration generation completed.")
81
- st.write("Generated Narration Script:")
82
- st.text(narration_script)
83
-
84
- # Add download option for generated narration
85
- if narration_script:
86
- st.download_button("Download Narration", narration_script, "narration.txt")
 
 
1
  import streamlit as st
2
+ import torch
 
3
  from ultralytics import YOLO
4
+ import pytesseract
5
+ from PIL import Image
6
  import numpy as np
7
+ from transformers import pipeline
8
+ import os
9
+ import time
10
 
11
# Point pytesseract at the conventional Linux location of the Tesseract binary
# only when that file actually exists; otherwise pytesseract's own default is
# left in place so the app can still start where the binary lives elsewhere.
_TESSERACT_CMD = "/usr/bin/tesseract"
if os.path.exists(_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD


@st.cache_resource
def _load_yolo_model():
    """Load the YOLOv8 nano checkpoint used for panel and character detection."""
    return YOLO('yolov8n.pt')


@st.cache_resource
def _load_summarizer():
    """Build the Hugging Face summarization pipeline (library default model)."""
    return pipeline("summarization")


# Streamlit re-executes this script on every widget interaction. Routing the
# model construction through st.cache_resource keeps a single shared instance
# instead of reloading the weights on each rerun.
yolo_model = _load_yolo_model()
summarizer = _load_summarizer()

# App title
st.title("Manga Narration for the Visually Impaired")

# Sidebar to upload images
st.sidebar.title("Upload Manga Images")
uploaded_files = st.sidebar.file_uploader(
    "Select up to 60 manga images",
    type=["png", "jpg", "jpeg"],
    accept_multiple_files=True,
)

# Progress bar, advanced by process_images() after each image
progress_bar = st.sidebar.progress(0)

# Hyperparameters for tuning
st.sidebar.title("Hyperparameters")
confidence_threshold = st.sidebar.slider(
    "YOLO Confidence Threshold", min_value=0.1, max_value=1.0, value=0.25
)
iou_threshold = st.sidebar.slider(
    "YOLO IoU Threshold", min_value=0.1, max_value=1.0, value=0.45
)
# NOTE(review): this value is forwarded to the summarizer as `max_length`,
# which the pipeline measures in tokens rather than words -- confirm whether
# the label should say so.
summarization_length = st.sidebar.slider(
    "Summary Length (words)", min_value=50, max_value=300, value=100
)
35
+
36
def detect_panels_and_characters(image, panel_class_id=0, character_class_id=1):
    """Detect panels and characters in one image with the module-level YOLO model.

    Args:
        image: Image passed straight through to ``yolo_model.predict``.
        panel_class_id: Model class id counted as a panel (default 0).
        character_class_id: Model class id counted as a character (default 1).

    Returns:
        A ``(panels, characters)`` tuple of lists. Each entry is the ``xyxy``
        value of one detected box, moved to the CPU and converted to a numpy
        array.

    NOTE(review): the defaults assume class 0 is "panel" and class 1 is
    "character". The stock ``yolov8n.pt`` checkpoint is presumably the COCO
    one, where those ids mean something else -- confirm that a manga-specific
    checkpoint is what gets loaded, or pass the right ids explicitly.
    """
    # Thresholds come from the sidebar sliders defined at module level.
    results = yolo_model.predict(image, conf=confidence_threshold, iou=iou_threshold)

    panels = []
    characters = []
    # A single image was submitted, so only the first result is relevant.
    for box in results[0].boxes:
        # `box.cls` is a one-element tensor; turn it into a plain int so the
        # comparisons below are ordinary integer comparisons.
        class_id = int(box.cls.item())
        if class_id == panel_class_id:
            panels.append(box.xyxy.cpu().numpy())
        elif class_id == character_class_id:
            characters.append(box.xyxy.cpu().numpy())

    return panels, characters
50
 
51
def detect_text(image):
    """Return the text Tesseract reads from ``image``.

    Args:
        image: Array accepted by ``PIL.Image.fromarray``.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # OCR runs on a single-channel ("L") copy of the image.
    grayscale = Image.fromarray(image).convert("L")
    return pytesseract.image_to_string(grayscale)
56
+
57
# Minimum summary length requested from the summarizer. Text with fewer words
# than this is narrated verbatim, because forcing the model to produce a
# longer output than its input makes it invent content.
_MIN_SUMMARY_LENGTH = 30


def generate_narration(panels, characters, text):
    """Build the narration string for one image.

    Args:
        panels: Detected panels; only the count is used.
        characters: Detected characters; only the count is used.
        text: OCR text extracted from the image (may be empty or whitespace).

    Returns:
        A string made of an optional panel count, an optional character count
        and, when ``text`` has non-whitespace content, the text summary (or
        the text itself when it is too short to summarise).
    """
    parts = []
    if panels:
        parts.append(f"Detected {len(panels)} panels. ")
    if characters:
        parts.append(f"{len(characters)} characters were found in the scene. ")

    # OCR output is full of line breaks and runs of spaces; collapse them so
    # the summarizer (and the listener) get one clean sentence stream.
    words = text.split()
    if words:
        cleaned = " ".join(words)
        parts.append("Here's a summary of the text: ")
        if len(words) < _MIN_SUMMARY_LENGTH:
            parts.append(cleaned)
        else:
            result = summarizer(
                cleaned,
                max_length=summarization_length,
                min_length=min(_MIN_SUMMARY_LENGTH, summarization_length),
                do_sample=False,
                # Cut over-long input down to the model's limit instead of
                # letting the pipeline fail on it.
                truncation=True,
            )
            parts.append(result[0]['summary_text'])

    return "".join(parts)
72
+
73
def process_images(uploaded_files):
    """Run detection, OCR and narration for every uploaded image.

    For each file the image is displayed together with its narration, and the
    module-level sidebar progress bar is advanced.

    Args:
        uploaded_files: Sequence of file-like objects readable by
            ``PIL.Image.open``.

    Returns:
        A list with one narration string per input file, in input order.
    """
    narrations = []
    total_images = len(uploaded_files)

    for idx, file in enumerate(uploaded_files):
        source_image = Image.open(file)

        # Palette, grayscale and RGBA uploads do not decode to an HxWx3
        # array; normalise to RGB so both detection and OCR get 3 channels.
        image_rgb = np.array(source_image.convert("RGB"))

        # Ultralytics interprets raw numpy input as BGR while PIL yields RGB,
        # so reverse the channel axis for the detector only.
        image_bgr = np.ascontiguousarray(image_rgb[:, :, ::-1])
        panels, characters = detect_panels_and_characters(image_bgr)

        # OCR works on the RGB array (it is converted to grayscale there).
        text = detect_text(image_rgb)

        narration = generate_narration(panels, characters, text)
        narrations.append(narration)

        # Update progress bar
        progress_bar.progress((idx + 1) / total_images)

        # Display the current image and its narration
        st.image(source_image, caption=f"Image {idx + 1}")
        st.write(narration)

    return narrations
100
+
101
# Upper bound advertised by the uploader label ("Select up to 60 manga images").
MAX_IMAGES = 60

if uploaded_files:
    files_to_process = uploaded_files
    if len(files_to_process) > MAX_IMAGES:
        # The uploader widget itself does not cap the number of files, so
        # enforce the advertised limit here and tell the user about it.
        st.warning(
            f"Only the first {MAX_IMAGES} of {len(files_to_process)} "
            "uploaded images will be processed."
        )
        files_to_process = files_to_process[:MAX_IMAGES]

    # Process uploaded images
    narrations = process_images(files_to_process)

    # Show final results after processing all images
    st.write("Narration Summary for All Images:")
    st.write("\n\n".join(narrations))
else:
    st.write("Please upload manga images to get started.")