# Scraped page header (originally "Spaces: Paused"); not part of the program.
import base64 | |
import io | |
import os | |
import zipfile | |
from dash import Dash, dcc, html, Input, Output, State, callback_context, no_update | |
import dash_bootstrap_components as dbc | |
from docx import Document | |
from docx.enum.style import WD_STYLE_TYPE | |
import markdown | |
import threading | |
import time | |
import PyPDF2 | |
import re | |
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])


def _build_layout():
    """Assemble the single-page Auto-Wiki layout.

    The page contains, top to bottom: the title, a drag-and-drop upload area
    accepting multiple .docx/.pdf files, a placeholder for the uploaded-file
    list, two initially hidden status lines, the initially disabled convert
    button, and the download target.
    """
    upload_box_style = {
        'width': '100%',
        'height': '60px',
        'lineHeight': '60px',
        'borderWidth': '1px',
        'borderStyle': 'dashed',
        'borderRadius': '5px',
        'textAlign': 'center',
        'margin': '10px',
    }
    upload_prompt = html.Div(['Drag and Drop or ', html.A('Select Files')])
    uploader = dcc.Upload(
        id='upload-data',
        children=upload_prompt,
        style=upload_box_style,
        multiple=True,
        accept='.docx,.pdf',
    )
    convert_button = dbc.Button(
        "Convert and Download",
        id="convert-button",
        color="primary",
        className="mt-3",
        disabled=True,
    )
    return dbc.Container([
        html.H1("Auto-Wiki", className="my-4"),
        uploader,
        html.Div(id='upload-output'),
        # Each status line gets its own style dict; both start hidden.
        html.Div(id="upload-status", style={"display": "none"}),
        html.Div(id="conversion-status", style={"display": "none"}),
        convert_button,
        dcc.Download(id="download-zip"),
    ])


app.layout = _build_layout()
_HEADING_LEVEL_RE = re.compile(r'(\d+)\s*$')


def _heading_level(style_name):
    """Return the Markdown heading depth (1-6) for a Word heading style name."""
    found = _HEADING_LEVEL_RE.search(style_name)
    # A heading style without a trailing number is treated as top level.
    level = int(found.group(1)) if found else 1
    # Markdown only defines six heading levels.
    return max(1, min(level, 6))


def _emphasize(text, bold, italic):
    """Wrap *text* in Markdown emphasis markers, keeping outer whitespace outside."""
    core = text.strip()
    if not core or not (bold or italic):
        return text
    lead_len = len(text) - len(text.lstrip())
    lead = text[:lead_len]
    trail = text[lead_len + len(core):]
    marker = ('**' if bold else '') + ('*' if italic else '')
    return f"{lead}{marker}{core}{marker}{trail}"


def _runs_to_markdown(runs):
    """Render a sequence of runs as Markdown, merging equally formatted neighbours."""
    segments = []  # list of [text, (bold, italic)]
    for run in runs:
        if not run.text:
            continue
        # bold/italic may be None (inherited); treat that as "not set".
        fmt = (bool(run.bold), bool(run.italic))
        if segments and segments[-1][1] == fmt:
            segments[-1][0] += run.text
        else:
            segments.append([run.text, fmt])
    return ''.join(_emphasize(text, *fmt) for text, fmt in segments)


def process_docx(contents, filename):
    """Convert an uploaded .docx document to Markdown text.

    Paragraphs whose style name starts with ``Heading`` become ``#`` headings,
    ``List Bullet`` / ``List Number`` paragraphs become list items, and bold or
    italic runs are wrapped in ``**`` / ``*``.

    Args:
        contents: String of the form ``"<header>,<base64 payload>"``; the
            payload is decoded and opened as a Word document.
        filename: Name of the uploaded file (currently unused).

    Returns:
        The converted paragraphs joined by blank lines.

    Raises:
        ValueError: If ``contents`` contains no ``,`` separator.
    """
    # Base64 never contains a comma, so the payload is whatever follows the
    # last one, even if the header itself contains commas.
    _header, separator, payload = contents.rpartition(',')
    if not separator:
        raise ValueError("contents must look like '<header>,<base64 data>'")
    document = Document(io.BytesIO(base64.b64decode(payload)))

    blocks = []
    for para in document.paragraphs:
        style_name = para.style.name or ''
        if style_name.startswith('Heading'):
            blocks.append(f"{'#' * _heading_level(style_name)} {para.text}")
            continue

        text = para.text
        runs = para.runs
        # Format run by run instead of str.replace() on the whole paragraph:
        # replace() touched every occurrence of the run text and, for empty
        # runs, inserted markers between all characters. Only do so when the
        # runs account for the full paragraph text, so nothing is dropped
        # (presumably hyperlink text can be in para.text but not in para.runs
        # in recent python-docx releases; verify against the installed version).
        if ''.join(run.text for run in runs) == text:
            text = _runs_to_markdown(runs)

        if style_name == 'List Bullet':
            blocks.append(f"- {text}")
        elif style_name == 'List Number':
            blocks.append(f"1. {text}")
        else:
            blocks.append(text)
    return '\n\n'.join(blocks)
# Underscore-delimited emphasis. The delimiters must sit on word boundaries
# and hug non-space, non-underscore text, so identifiers such as
# snake_case_name and fill-in lines such as "____" are left untouched.
_PDF_ITALIC_RE = re.compile(r'(?<!\w)_(?=[^\s_])(.+?)(?<=[^\s_])_(?!\w)')
_PDF_NUMBERED_RE = re.compile(r'^(\d+\.)\s', re.MULTILINE)
# Bullet glyphs U+2022, U+25CF and U+25CB, written as escapes so the pattern
# survives encoding round-trips. The previous literal character class had been
# corrupted into unrelated characters and never matched a bullet.
_PDF_BULLET_RE = re.compile(r'^[\u2022\u25cf\u25cb]\s', re.MULTILINE)


def _pdf_page_to_markdown(text):
    """Apply the heuristic Markdown formatting to one page of extracted text."""
    text = _PDF_ITALIC_RE.sub(r'*\1*', text)       # Italic
    text = _PDF_NUMBERED_RE.sub(r'\1 ', text)      # Numbered lists
    text = _PDF_BULLET_RE.sub('- ', text)          # Bullet points

    # Heading heuristic (simplified): the first line of the page, and any
    # short line (< 50 chars) starting with an uppercase letter.
    lines = text.split('\n')
    for index, line in enumerate(lines):
        stripped = line.strip()
        if not stripped:
            # Never emit an empty "## " heading for blank lines or pages.
            continue
        if index == 0 or (len(line) < 50 and stripped[0].isupper()):
            lines[index] = f"## {line}"
    return '\n'.join(lines)


def process_pdf(contents, filename):
    """Extract the text of an uploaded PDF and apply heuristic Markdown formatting.

    Formatting detection is best-effort: underscore-delimited spans become
    italics, numbered and bulleted lines are normalised to Markdown list
    syntax, and likely headings are prefixed with ``##``.

    Args:
        contents: String of the form ``"<header>,<base64 payload>"``; the
            payload is decoded and read as a PDF.
        filename: Name of the uploaded file (currently unused).

    Returns:
        The formatted text of all pages, joined by blank lines.

    Raises:
        ValueError: If ``contents`` contains no ``,`` separator.
    """
    # Base64 never contains a comma, so the payload follows the last one.
    _header, separator, payload = contents.rpartition(',')
    if not separator:
        raise ValueError("contents must look like '<header>,<base64 data>'")
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(base64.b64decode(payload)))

    # extract_text() may yield nothing for image-only pages; treat as empty.
    return '\n\n'.join(
        _pdf_page_to_markdown(page.extract_text() or '')
        for page in pdf_reader.pages
    )
def _markdown_entry_name(filename, used_names):
    """Return a unique ``<stem>.md`` archive name for *filename*."""
    # Keep only the final path component so entries cannot escape the
    # extraction directory, then swap the real extension (any letter case).
    base = os.path.basename(filename.replace('\\', '/'))
    stem = os.path.splitext(base)[0]
    candidate = f"{stem}.md"
    counter = 1
    # e.g. "a.docx" and "a.pdf" would otherwise both become "a.md".
    while candidate in used_names:
        candidate = f"{stem}_{counter}.md"
        counter += 1
    used_names.add(candidate)
    return candidate


def process_files(contents, filenames):
    """Convert uploaded documents and bundle the results into a zip archive.

    Files are paired positionally; names ending in ``.docx`` or ``.pdf``
    (case-insensitive) are converted, everything else is skipped.

    Args:
        contents: Iterable of encoded file contents, one per file.
        filenames: Iterable of the corresponding file names.

    Returns:
        The bytes of a DEFLATE-compressed zip archive containing one ``.md``
        entry per converted file (an empty archive if nothing was converted).
    """
    converted = []
    used_names = set()
    for content, name in zip(contents, filenames):
        lowered = name.lower()
        if lowered.endswith('.docx'):
            text = process_docx(content, name)
        elif lowered.endswith('.pdf'):
            text = process_pdf(content, name)
        else:
            continue  # Skip unsupported file types
        # NOTE(review): markdown.markdown() presumably renders the Markdown
        # text to HTML, so the ".md" entries would contain HTML - confirm
        # this is the intended output format.
        rendered = markdown.markdown(text)
        converted.append((_markdown_entry_name(name, used_names), rendered))

    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as archive:
        for entry_name, entry_body in converted:
            archive.writestr(entry_name, entry_body)
    return zip_buffer.getvalue()
# Register the function as a Dash callback; without this it is never invoked
# and the convert button can never be enabled. The seven outputs follow the
# order of the tuples returned below. The input/state wiring is inferred from
# the parameter list and the trigger ids checked in the body - verify against
# the intended design.
@app.callback(
    [
        Output('upload-output', 'children'),
        Output('convert-button', 'disabled'),
        Output('upload-status', 'children'),
        Output('upload-status', 'style'),
        Output('conversion-status', 'children'),
        Output('conversion-status', 'style'),
        Output('download-zip', 'data'),
    ],
    [
        Input('upload-data', 'contents'),
        Input('upload-data', 'filename'),
        Input('convert-button', 'n_clicks'),
    ],
    [
        State('upload-data', 'contents'),
        State('upload-data', 'filename'),
    ],
)
def update_output(list_of_contents, list_of_names, n_clicks, contents, filenames):
    """React to a file upload or a click on the convert button.

    Args:
        list_of_contents: Contents of the uploaded files (upload trigger).
        list_of_names: Names of the uploaded files (upload trigger).
        n_clicks: Click count of the convert button.
        contents: Contents of the uploaded files, used for conversion.
        filenames: Names of the uploaded files, used for conversion.

    Returns:
        ``no_update`` when there is nothing to do, otherwise a 7-tuple:
        uploaded-file list, convert-button ``disabled`` flag, upload status
        text and style, conversion status text and style, download payload.
    """
    ctx = callback_context
    if not ctx.triggered:
        return no_update

    # Look at every triggering property rather than only the first entry, so
    # the result does not depend on the order in which they are reported.
    triggered = {item['prop_id'] for item in ctx.triggered}

    if 'upload-data.contents' in triggered:
        if list_of_contents is None:
            return no_update
        children = [
            html.Div([
                html.H5(f"File uploaded: {name}"),
                html.Hr(),
            ])
            for name in (list_of_names or [])
        ]
        # Enable the convert button and show the upload status line.
        return (
            children,
            False,
            "Files uploaded successfully",
            {"display": "block"},
            "",
            {"display": "none"},
            None,
        )

    if 'convert-button.n_clicks' in triggered:
        if n_clicks is None or not contents:
            return no_update
        # NOTE(review): the conversion runs synchronously inside this
        # callback, so the "Converting files..." text reaches the browser
        # together with the finished download, and the button stays disabled
        # until the next upload re-enables it. Confirm this is intended.
        zip_data = process_files(contents, filenames)
        return (
            no_update,
            True,
            "",
            {"display": "none"},
            "Converting files... This may take a moment.",
            {"display": "block"},
            dcc.send_bytes(zip_data, "converted_files.zip"),
        )

    return no_update
if __name__ == '__main__':
    # The server listens on all interfaces, so debug mode must be opt-in
    # rather than hard-coded: Dash's debug tooling is meant for development
    # only. Set DASH_DEBUG=1 (or true/yes/on) to enable it locally.
    debug_enabled = os.environ.get('DASH_DEBUG', '').strip().lower() in {'1', 'true', 'yes', 'on'}
    print("Starting the Dash application...")
    app.run(debug=debug_enabled, host='0.0.0.0', port=7860)
    print("Dash application has finished running.")