Spaces:
Sleeping
Sleeping
File size: 10,738 Bytes
e4e56ea 6edd739 e4e56ea 850c648 e4e56ea 850c648 e4e56ea 850c648 e4e56ea 850c648 e4e56ea 850c648 e4e56ea 850c648 e4e56ea 850c648 e4e56ea |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
import streamlit as st
import os
import pytesseract
from PIL import Image
import time
from utils import extract_frames_interval, extract_frames_pyscenedetect
st.title('๐ผ๏ธ Step 3: Video Processing (Frame Extraction & OCR)')

# Guard: a video must have been uploaded/processed on the previous page.
# `.get` collapses the three original checks (key present / truthy / on disk).
video_path = st.session_state.get('video_path')
if not video_path or not os.path.exists(video_path):
    st.warning('Video file not found. Please go back to the **๐ค Upload** page and process a video first.')
    st.stop()

st.write(f'Video file to process: `{os.path.basename(video_path)}`')
#
# ==================================================================
#
col_method, col_config = st.columns(2)

# --- Extraction method & OCR language ---
with col_method.container(border=True):
    # One of three frame-extraction strategies; drives the config panel below.
    extraction_method = st.radio(
        'Extraction method:',
        ('interval', 'video2slides', 'pyscenedetect'),
        index=0,
        horizontal=True,
    )
    # Tesseract language code(s); several can be combined with '+'.
    ocr_lang = st.text_input('OCR Language(s) (e.g. `rus`, `rus+eng`):', value='rus')
# --- Configuration ---
with col_config.expander(f'**`{extraction_method}` METHOD CONFIG**', expanded=True):
match extraction_method:
case 'interval':
extraction_interval = st.number_input(
'Frames extraction interval:',
min_value=0, max_value=25, step=1, format='%i', value=5,
help='Extract frames every `x` seconds'
)
case 'video2slides':
print('video2slides')
case 'pyscenedetect':
extraction_threshold = st.number_input(
'Frames extraction threshold:',
min_value=0.1, max_value=10.0, step=0.1, format='%f', value=1.5,
)
# --- Semantic Segmentation Placeholder ---
# st.markdown("---")
# --- Tesseract Configuration (Optional but recommended) ---
# Uncomment and set the path if tesseract is not in your PATH
# pytesseract.pytesseract.tesseract_cmd = r'/path/to/your/tesseract' # Example: '/usr/bin/tesseract' or 'C:\Program Files\Tesseract-OCR\tesseract.exe'
# # --- Frame Extraction and OCR ---
# st.subheader('OCR')
# --- Frame extraction ---
_, col_button_extract, _ = st.columns([2, 1, 2])
if col_button_extract.button('Extract Frames', type='primary', use_container_width=True):
    st.session_state['frames_paths'] = []  # clear previous results
    col_info, col_complete, col_next = st.columns(3)

    # Choose the extractor for the selected method.
    # BUG FIX: the interval method previously wrote into 'frames_pyscenedetect'
    # (copy-paste); each method now gets its own output directory.
    if extraction_method == 'interval':
        spinner_msg = f'Extracting frames every {extraction_interval} seconds (using interval method)..'
        run_extraction = lambda: extract_frames_interval(
            video_path, 'frames_interval', interval_sec=extraction_interval)
    elif extraction_method == 'pyscenedetect':
        spinner_msg = f'Extracting frames with `threshold={extraction_threshold}` (using pyscenedetect method)..'
        run_extraction = lambda: extract_frames_pyscenedetect(
            video_path, 'frames_pyscenedetect', threshold=extraction_threshold)
    else:
        # 'video2slides' previously fell through silently on click; tell the user.
        col_info.warning('`video2slides` extraction is not implemented yet.')
        st.stop()

    with st.spinner(spinner_msg):
        start_time = time.time()
        frames_dir, frame_paths = run_extraction()
        extract_time = time.time() - start_time

    if frames_dir and frame_paths:
        st.session_state['frames_dir'] = frames_dir
        st.session_state['frames_paths'] = frame_paths  # store paths for the preview step
        col_info.success(f'Extracted {len(frame_paths)} frames in {extract_time:.2f}s.')
    else:
        col_info.error('Failed to extract frames')
        st.stop()
# --- Frame preview (timestamps parsed from filenames) ---
if st.session_state['frames_paths']:
    total_frames = len(st.session_state['frames_paths'])
    start_ocr_time = time.time()
    extracted_texts = []
    processed_count = 0

    # Show at most this many example frames side by side.
    max_display_frames = 6
    display_cols = st.columns(min(max_display_frames, total_frames) if total_frames > 0 else 1)
    display_idx = 0

    for frame_path in st.session_state['frames_paths']:
        img = Image.open(frame_path)
        # Derive a timestamp from the filename — assumes the format
        # frame_XXXXXX.png where XXXXXX is the offset in seconds (TODO confirm
        # against the extractor's naming in utils).
        try:
            secs = int(os.path.basename(frame_path).split('_')[1].split('.')[0])
            timestamp = time.strftime('%H:%M:%S', time.gmtime(secs))
        except (IndexError, ValueError):
            # BUG FIX: the bare `except:` left `timestamp` unbound, so the
            # st.image caption below raised NameError on the first unparsable
            # filename (and swallowed SystemExit/KeyboardInterrupt too).
            timestamp = 'N/A'
        extracted_texts.append({'timestamp': timestamp, 'image': img})

        # Render the first few frames as a preview row.
        if display_idx < max_display_frames and display_idx < len(display_cols):
            with display_cols[display_idx]:
                st.image(img, caption=f'Frame (t={timestamp})', use_container_width=True)
            display_idx += 1
        processed_count += 1
# # Process frames in batches or one by one
# for i, frame_path in enumerate(st.session_state['frames_paths']):
# try:
# img = Image.open(frame_path)
# # --- Potential Preprocessing/Filtering ---
# # Add logic here if needed:
# # - Detect if frame likely contains text (e.g., check contrast, edges)
# # - If segmentation was implemented, crop to slide regions here
# # --- Perform OCR ---
# text = pytesseract.image_to_string(img, lang=ocr_lang)
# # --- Basic Text Cleaning/Filtering ---
# cleaned_text = text.strip()
# if cleaned_text and len(cleaned_text) > 10: # filter very short/noisy results
# # Extract timestamp from filename (assuming format frame_XXXXXX.png)
# try:
# secs = int(os.path.basename(frame_path).split('_')[1].split('.')[0])
# timestamp = time.strftime('%H:%M:%S', time.gmtime(secs))
# extracted_texts.append({'timestamp': timestamp, 'text': cleaned_text})
# except:
# extracted_texts.append({'timestamp': 'N/A', 'text': cleaned_text}) # fallback if filename parse fails
# # Display some examples
# if display_idx < max_display_frames and display_idx < len(display_cols):
# with display_cols[display_idx]:
# st.image(img, caption=f'Frame (t={timestamp})', use_container_width=True)
# st.text(f'OCR:\n{cleaned_text[:100]}..') # show snippet
# display_idx += 1
# processed_count += 1
# ocr_progress.progress(processed_count / total_frames)
# except Exception as ocr_err:
# col_info.warning(f'Could not perform OCR on {os.path.basename(frame_path)}: {ocr_err}')
# processed_count += 1 # still count as processed
# ocr_progress.progress(processed_count / total_frames)
# ocr_time = time.time() - start_ocr_time
# col_complete.success(f'OCR processing finished in {ocr_time:.2f}s.')
# # --- Aggregate and Deduplicate OCR Text ---
# # Simple approach: Combine unique text blocks
# final_ocr_text = ""
# seen_texts = set()
# last_text = ""
# min_similarity_threshold = 0.8 # requires a library like `thefuzz` or similar for proper check
# # basic check: avoid exact consecutive duplicates
# for item in extracted_texts:
# current_text_block = item['text'].strip()
# # Basic check: Only add if significantly different from the last block
# # A more robust check would involve sequence matching or fuzzy matching
# is_duplicate = False
# if last_text:
# # Simple check: exact match or near-exact length/content start?
# if (current_text_block == last_text or
# (abs(len(current_text_block) - len(last_text)) < 10 and
# current_text_block.startswith(last_text[:20]))
# ):
# is_duplicate = True # likely a duplicate from consecutive frames
# if current_text_block and not is_duplicate: # only add non-empty, non-duplicate text
# final_ocr_text += f"\n\n--- Text from frame around {item['timestamp']} ---\n"
# final_ocr_text += current_text_block
# last_text = current_text_block # update last text added
# st.session_state['ocr_text'] = final_ocr_text.strip()
# if st.session_state['ocr_text']:
# col_complete.info('OCR processing complete.')
# col_next.page_link('ui_summarize.py', label='Next Step: **๐ Summarize**', icon='โก๏ธ')
# else:
# col_complete.warning('No significant text found via OCR')
# # --- Display OCR Results ---
# st.subheader('Aggregated OCR Text')
# if 'ocr_text' in st.session_state and st.session_state['ocr_text']:
# st.text_area("Extracted Text from Frames", st.session_state['ocr_text'], height=400)
# else:
# st.info('OCR has not been run or no text was detected')
# st.divider()
# st.subheader('Semantic Segmentation')
|