Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,177 +1,56 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
#
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
choices = [e for e in ALLOWED_TARGET_EXTS if query.lower() in e]
|
58 |
-
if not choices:
|
59 |
-
st.sidebar.error("No extension matches that filter.")
|
60 |
-
choices = ALLOWED_TARGET_EXTS
|
61 |
-
return st.sidebar.selectbox(
|
62 |
-
"Target extension for **all** files", choices, index=choices.index(".pdf") if ".pdf" in choices else 0
|
63 |
-
)
|
64 |
-
|
65 |
-
def uploader():
|
66 |
-
return st.file_uploader(
|
67 |
-
"Upload files to convert", type=None, accept_multiple_files=True
|
68 |
-
)
|
69 |
-
|
70 |
-
# -----------------------------------------------------------------------------
|
71 |
-
# Conversion functions
|
72 |
-
# -----------------------------------------------------------------------------
|
73 |
-
def convert_image(data: bytes, target_ext: str) -> bytes:
|
74 |
-
img = Image.open(io.BytesIO(data))
|
75 |
-
buf = io.BytesIO()
|
76 |
-
fmt = {".jpg":"JPEG", ".jpeg":"JPEG", ".png":"PNG", ".gif":"GIF",
|
77 |
-
".bmp":"BMP", ".tiff":"TIFF", ".ico":"ICO", ".webp":"WEBP"}[target_ext]
|
78 |
-
img.save(buf, format=fmt)
|
79 |
-
return buf.getvalue()
|
80 |
-
|
81 |
-
|
82 |
-
def convert_text_markup(data: bytes, orig_ext: str, target_ext: str) -> bytes:
|
83 |
-
text = data.decode("utf-8", errors="ignore")
|
84 |
-
return pypandoc.convert_text(text, to=target_ext.lstrip('.'), format=orig_ext.lstrip('.')).encode('utf-8')
|
85 |
-
|
86 |
-
|
87 |
-
def convert_office(temp_dir: str, data: bytes, orig_ext: str, target_ext: str) -> bytes:
|
88 |
-
# Use unoconv to convert office files
|
89 |
-
suffix_in = orig_ext
|
90 |
-
suffix_out = target_ext
|
91 |
-
in_path = Path(temp_dir) / f"input{suffix_in}"
|
92 |
-
out_path = Path(temp_dir) / f"output{suffix_out}"
|
93 |
-
in_path.write_bytes(data)
|
94 |
-
subprocess.run(["unoconv", "-f", suffix_out.lstrip('.'), "-o", str(out_path), str(in_path)], check=True)
|
95 |
-
return out_path.read_bytes()
|
96 |
-
|
97 |
-
|
98 |
-
def convert_media(data: bytes, target_ext: str) -> bytes:
|
99 |
-
# ffmpeg-python streaming
|
100 |
-
process = (
|
101 |
-
ffmpeg.input('pipe:0')
|
102 |
-
.output('pipe:1', format=target_ext.lstrip('.'))
|
103 |
-
.run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True)
|
104 |
-
)
|
105 |
-
out, err = process.communicate(data)
|
106 |
-
return out
|
107 |
-
|
108 |
-
|
109 |
-
def convert_file(file: st.runtime.uploaded_file_manager.UploadedFile, target_ext: str) -> tuple[bytes, str]:
|
110 |
-
name = Path(file.name)
|
111 |
-
orig_ext = name.suffix.lower()
|
112 |
-
raw = file.read()
|
113 |
-
|
114 |
-
if orig_ext in DISALLOWED_SOURCE_EXTS:
|
115 |
-
raise ValueError(f"Disallowed: {orig_ext}")
|
116 |
-
|
117 |
-
mime = magic.from_buffer(raw, mime=True) or ''
|
118 |
-
|
119 |
-
try:
|
120 |
-
if orig_ext in IMAGE_EXTS and target_ext in IMAGE_EXTS:
|
121 |
-
return convert_image(raw, target_ext), "image converted"
|
122 |
-
if mime.startswith('text/') or orig_ext in TEXT_EXTS:
|
123 |
-
if orig_ext != target_ext:
|
124 |
-
return convert_text_markup(raw, orig_ext, target_ext), "text/markup converted"
|
125 |
-
if orig_ext in DOC_EXTS or target_ext in DOC_EXTS:
|
126 |
-
with tempfile.TemporaryDirectory() as tmp:
|
127 |
-
return convert_office(tmp, raw, orig_ext, target_ext), "office/doc converted"
|
128 |
-
if mime.startswith(('audio/','video/')) or orig_ext in MEDIA_EXTS:
|
129 |
-
if orig_ext != target_ext:
|
130 |
-
return convert_media(raw, target_ext), "media converted"
|
131 |
-
except Exception as e:
|
132 |
-
st.warning(f"⚠️ Conversion failed for {file.name}: {e}. Falling back to rename.")
|
133 |
-
|
134 |
-
# Fallback: no conversion, just rename
|
135 |
-
return raw, "renamed only"
|
136 |
-
|
137 |
-
# -----------------------------------------------------------------------------
|
138 |
-
# ZIP packaging
|
139 |
-
# -----------------------------------------------------------------------------
|
140 |
-
def package_zip(files: list[st.runtime.uploaded_file_manager.UploadedFile], target_ext: str) -> io.BytesIO:
|
141 |
-
buf = io.BytesIO()
|
142 |
-
with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
|
143 |
-
for file in files:
|
144 |
-
name = Path(file.name)
|
145 |
-
if name.suffix.lower() in DISALLOWED_SOURCE_EXTS:
|
146 |
-
st.warning(f"Skipping disallowed file: {name.name}")
|
147 |
-
continue
|
148 |
-
data, note = convert_file(file, target_ext)
|
149 |
-
out_name = name.with_suffix(target_ext).name
|
150 |
-
zf.writestr(out_name, data)
|
151 |
-
st.success(f"{note}: {name.name} → {out_name}")
|
152 |
-
buf.seek(0)
|
153 |
-
return buf
|
154 |
-
|
155 |
-
# -----------------------------------------------------------------------------
|
156 |
-
# Main
|
157 |
-
# -----------------------------------------------------------------------------
|
158 |
-
|
159 |
-
def main():
|
160 |
-
st.set_page_config("Universal Converter", page_icon="🔄", layout="centered")
|
161 |
-
st.title("🔄 Universal File-Format Converter")
|
162 |
-
st.write("Upload files of any format; choose a new extension; download a ZIP of converted files.")
|
163 |
-
|
164 |
-
target_ext = sidebar_target_extension()
|
165 |
-
files = uploader()
|
166 |
-
|
167 |
-
if files and st.button("Convert & Download 🚀"):
|
168 |
-
zip_buf = package_zip(files, target_ext)
|
169 |
-
ts = datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')
|
170 |
-
st.download_button("⬇️ Download ZIP", zip_buf,
|
171 |
-
file_name=f"converted_{ts}.zip",
|
172 |
-
mime='application/zip')
|
173 |
-
|
174 |
-
st.caption("© 2025 Universal Converter • Streamlit • Hugging Face Spaces")
|
175 |
-
|
176 |
-
if __name__ == '__main__':
|
177 |
-
main()
|
|
|
1 |
+
# app.py
|
2 |
+
import fitz # PyMuPDF
|
3 |
+
from markdownify import markdownify as md
|
4 |
+
import json
|
5 |
+
import gradio as gr
|
6 |
+
|
7 |
+
def convert_pdf_to_markdown(path):
|
8 |
+
"""Extract each page as HTML, convert to Markdown."""
|
9 |
+
doc = fitz.open(path)
|
10 |
+
pages_md = []
|
11 |
+
for i, page in enumerate(doc, start=1):
|
12 |
+
html = page.get_text("html") or ""
|
13 |
+
# Clean conversion: collapse multiple newlines
|
14 |
+
page_md = md(html).strip()
|
15 |
+
pages_md.append({"page": i, "markdown": page_md})
|
16 |
+
return pages_md
|
17 |
+
|
18 |
+
def process_upload(pdf_file, output_format):
|
19 |
+
"""
|
20 |
+
pdf_file: tempfile-like object from Gradio
|
21 |
+
output_format: "markdown" or "json"
|
22 |
+
"""
|
23 |
+
# Convert and collect
|
24 |
+
pages = convert_pdf_to_markdown(pdf_file.name)
|
25 |
+
|
26 |
+
if output_format == "markdown":
|
27 |
+
# Join all pages
|
28 |
+
full_md = "\n\n---\n\n".join(p["markdown"] for p in pages)
|
29 |
+
return full_md
|
30 |
+
else:
|
31 |
+
# Return pretty JSON
|
32 |
+
return json.dumps({"pages": pages}, indent=2, ensure_ascii=False)
|
33 |
+
|
34 |
+
# Gradio interface
|
35 |
+
demo = gr.Interface(
|
36 |
+
fn=process_upload,
|
37 |
+
inputs=[
|
38 |
+
gr.File(label="Upload your PDF", file_types=[".pdf"]),
|
39 |
+
gr.Radio(choices=["markdown", "json"],
|
40 |
+
value="markdown",
|
41 |
+
label="Output format")
|
42 |
+
],
|
43 |
+
outputs=gr.Code(label="Converted Output"),
|
44 |
+
title="PDF → Markdown/JSON Converter",
|
45 |
+
description=(
|
46 |
+
"Upload a PDF and get back a professionally converted Markdown "
|
47 |
+
"or a structured JSON with each page’s Markdown. "
|
48 |
+
"PDFs with images or complex tables may still need manual review."
|
49 |
+
),
|
50 |
+
examples=[
|
51 |
+
# you can add example PDFs here if desired
|
52 |
+
]
|
53 |
+
)
|
54 |
+
|
55 |
+
if __name__ == "__main__":
|
56 |
+
demo.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|