comic-pipeline-text / src /html_parser.py
umang-immersfy's picture
Added HTML viewer to Gradio UI
d895ad6
import ast
import json

from bs4 import BeautifulSoup
def _parse_characters(raw):
    """Turn the text of a "characters" table cell into a Python value.

    The cell is expected to hold a Python-style list literal such as
    ``['Alice', 'Bob']``. It is parsed with ``ast.literal_eval`` so that
    only literals are accepted and nothing in the HTML is ever executed.

    Args:
        raw: Text content of the cell.

    Returns:
        The parsed literal (normally a list of names). An empty cell yields
        ``[]``; text that is not a valid literal is split on commas into a
        list of stripped, non-empty names.
    """
    text = raw.strip()
    if not text:
        return []
    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        # Not a literal (e.g. "Alice, Bob"): fall back to a plain
        # comma-separated list instead of failing the whole document.
        return [name.strip() for name in text.split(',') if name.strip()]


def parse_scene_document(html_content):
    """Parse a scene HTML document into a plain dictionary.

    The document is expected to contain an ``<h3>`` header whose text has the
    form ``"<label>: <scene number>"``, a first ``<p>`` holding the synopsis
    (optionally prefixed with ``"Synopsis:"``), and optionally a ``<table>``
    whose first row is a header and whose remaining rows have six ``<td>``
    cells: frame number, description, characters, narration, location,
    setting.

    Args:
        html_content: HTML markup of the scene document.

    Returns:
        A dict with keys ``'scene_number'`` (str), ``'synopsis'`` (str) and
        ``'frames'`` (list of dicts with keys ``'frame_num'``,
        ``'description'``, ``'characters'``, ``'narration'``, ``'location'``
        and ``'setting'``). ``'frames'`` is empty when there is no table.

    Raises:
        ValueError: If the ``<h3>`` header or the synopsis ``<p>`` is
            missing, or the header text does not contain ``': '``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    header_tag = soup.find('h3')
    if header_tag is None:
        raise ValueError("scene document has no <h3> scene header")
    header_parts = header_tag.text.split(': ')
    if len(header_parts) < 2:
        raise ValueError(
            f"scene header {header_tag.text!r} is not of the form "
            "'<label>: <number>'"
        )
    scene_number = header_parts[1]

    # Extract synopsis
    synopsis_tag = soup.find('p')
    if synopsis_tag is None:
        raise ValueError("scene document has no <p> synopsis paragraph")
    synopsis = synopsis_tag.text.replace('Synopsis:', '').strip()

    # Extract frames from table
    frames = []
    table = soup.find('table')
    if table is not None:
        for row in table.find_all('tr')[1:]:  # Skip header row
            cells = row.find_all('td')
            if len(cells) < 6:
                # Spacer rows, <th>-only rows or truncated rows carry no
                # complete frame; skip them rather than raise IndexError.
                continue
            frames.append({
                'frame_num': cells[0].text.strip(),
                'description': cells[1].text.strip(),
                # literal_eval, not eval: the HTML is external input and
                # must never be executed as code.
                'characters': _parse_characters(cells[2].text),
                'narration': cells[3].text.strip(),
                'location': cells[4].text.strip(),
                'setting': cells[5].text.strip(),
            })

    return {
        'scene_number': scene_number,
        'synopsis': synopsis,
        'frames': frames,
    }