# Spaces: Running
from flask import Flask, render_template, request, send_file, jsonify | |
import fitz # PyMuPDF | |
from docx import Document | |
from openpyxl import Workbook | |
from pdf2docx import Converter | |
import tempfile | |
import os | |
from io import BytesIO | |
import math | |
from PyPDF2 import PdfMerger | |
from flask_cors import CORS | |
import threading, time, requests | |
from dotenv import load_dotenv | |
# Resolve the directory that contains this file so the .env lookup below does
# not depend on the process's current working directory.
basedir = os.path.abspath(os.path.dirname(__file__))
# Load environment variables from the .env file next to this module
# (background_task reads the `url` variable from the environment).
load_dotenv(os.path.join(basedir, '.env'))
# The Flask application object is created once, further down, right before
# CORS is attached to it; a second instance created here would only be
# discarded when `app` is rebound there.
def background_task():
    """Ping the URL stored in the ``url`` environment variable, forever.

    Each cycle issues a GET request to ``os.environ.get('url')``, prints the
    HTTP status code, then sleeps for 600 seconds. Any exception raised while
    pinging is printed and swallowed so the loop keeps going.

    The function never returns.
    """
    while True:
        try:
            # Example: ping your own Space or another API.
            # requests applies no timeout unless one is given; without it a
            # stalled connection would block this loop indefinitely.
            r = requests.get(os.environ.get('url'), timeout=30)
            print("Ping:", r.status_code)
        except Exception as e:
            # Best-effort keep-alive: report the failure and retry next cycle.
            print("Error:", e)
        time.sleep(600)
# Application object, with flask-cors attached using its default settings.
app = Flask(__name__)
CORS(app)

# Scratch directories: uploads are written to UPLOAD_FOLDER and compressed
# results to OUTPUT_FOLDER. Both are created at import time if missing.
UPLOAD_FOLDER = "/tmp/uploads"
OUTPUT_FOLDER = "/tmp/compressed"
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
def index():
    """Render and return the ``index.html`` template.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    page = render_template('index.html')
    return page
def pdf_to_text():
    """Convert an uploaded PDF into a plain-text download.

    On POST, reads the upload from the ``pdf`` form field, concatenates the
    text of every page and returns it as an ``output.txt`` attachment.
    For any other method, renders ``pdf_to_text.html``.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    if request.method != 'POST':
        return render_template('pdf_to_text.html')

    file = request.files['pdf']
    # Use the document as a context manager so it is closed as soon as the
    # text has been extracted, even if extraction raises.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        text = ''.join(page.get_text() for page in doc)

    # A BytesIO built from bytes is already positioned at offset 0.
    output = BytesIO(text.encode())
    return send_file(output, download_name="output.txt", as_attachment=True)
def pdf_to_word():
    """Convert an uploaded PDF into a Word document download.

    On POST, reads the upload from the ``pdf`` form field, adds one paragraph
    per PDF page (containing that page's text) to a new document and returns
    it as an ``output.docx`` attachment. For any other method, renders
    ``pdf_to_word.html``.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    if request.method != 'POST':
        return render_template('pdf_to_word.html')

    file = request.files['pdf']
    word_doc = Document()
    # Close the PDF deterministically once every page has been copied over,
    # even if text extraction raises part-way through.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        for page in doc:
            word_doc.add_paragraph(page.get_text())

    output = BytesIO()
    word_doc.save(output)
    output.seek(0)  # rewind so send_file streams from the start
    return send_file(output, download_name="output.docx", as_attachment=True)
def _write_page_words(ws, words, row_offset):
    """Lay one page's words out on *ws* below *row_offset*; return rows used.

    *words* is the list returned by ``page.get_text("words")``; the first
    five items of each entry are read as ``(x0, y0, x1, y1, word)``.
    Words are grouped into rows by ``round(y0 / 5)`` and into columns by
    ``floor(x0 / 50) + 1``. Words that fall into the same cell are joined
    with a single space, left to right, instead of overwriting each other.
    """
    # Sort by row bucket first, then by x, so that words sharing a cell are
    # concatenated in reading order even when their y values differ slightly.
    ordered = sorted(words, key=lambda w: (round(w[1] / 5), w[0]))

    row_map = {}  # row bucket -> 1-based row number within this page
    cells = {}    # (row, col) -> list of words in that cell
    for w in ordered:
        x0, y0, word = w[0], w[1], w[4]
        row_key = round(y0 / 5)  # group words by y position
        row = row_map.setdefault(row_key, len(row_map) + 1)
        # Adjust the divisor to control column spacing. Clamp to 1 because a
        # negative x would otherwise produce an invalid column index.
        col = max(1, math.floor(x0 / 50) + 1)
        cells.setdefault((row, col), []).append(word)

    for (row, col), parts in cells.items():
        ws.cell(row=row_offset + row, column=col, value=" ".join(parts))
    return len(row_map)


def pdf_to_excel():
    """Convert an uploaded PDF into a spreadsheet download.

    On POST, reads the upload from the ``pdf`` form field and places every
    word on the active worksheet according to its position on the page, then
    returns the workbook as an ``output.xlsx`` attachment. Pages are stacked
    one below the other: each page starts on the row after the last row used
    by the previous page, so later pages no longer overwrite earlier ones.
    For any other method, renders ``pdf_to_excel.html``.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    if request.method != 'POST':
        return render_template('pdf_to_excel.html')

    file = request.files['pdf']
    wb = Workbook()
    ws = wb.active

    rows_used = 0
    # Close the PDF as soon as all pages have been laid out.
    with fitz.open(stream=file.read(), filetype="pdf") as doc:
        for page in doc:
            rows_used += _write_page_words(ws, page.get_text("words"), rows_used)

    output = BytesIO()
    wb.save(output)
    output.seek(0)  # rewind so send_file streams from the start
    return send_file(output, download_name="output.xlsx", as_attachment=True)
def index_pdf_merge():
    """Render and return the ``pdf_marge.html`` template.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    page = render_template("pdf_marge.html")
    return page
def merge_pdfs():
    """Merge the uploaded PDFs, in upload order, into one download.

    Reads every file posted under the ``pdfs`` field. With fewer than two
    files it returns a JSON error and status 400. Otherwise the files are
    appended to a ``PdfMerger`` and the result is returned as a
    ``merged.pdf`` attachment. Any exception is printed with its traceback
    and reported as a JSON error with status 500.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    try:
        # must match <input name="pdfs">
        files = request.files.getlist("pdfs")
        # getlist() always returns a list, so the length check alone covers
        # the "nothing uploaded" case.
        if len(files) < 2:
            return jsonify({"error": "Please upload at least 2 PDF files"}), 400

        # merge PDFs
        merger = PdfMerger()
        output = BytesIO()
        try:
            for file in files:
                merger.append(file)
            merger.write(output)
        finally:
            # Release the merger's resources even when append/write fails.
            merger.close()

        output.seek(0)  # rewind so send_file streams from the start
        return send_file(
            output,
            download_name="merged.pdf",
            as_attachment=True
        )
    except Exception as e:
        import traceback
        print("❌ Merge error:", traceback.format_exc())
        return jsonify({"error": str(e)}), 500
def compress_pdf(input_path, output_path, preset="medium", target_kb=None):
    """Write a compressed copy of the PDF at *input_path* to *output_path*.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path the result is written to.
        preset: ``"low"``, ``"medium"`` or ``"high"``; selects the
            ``garbage``/``deflate``/``clean`` options passed to
            ``Document.save``. Unknown values fall back to ``"medium"``.
        target_kb: Optional size target in KiB. When falsy, the document is
            simply re-saved with the preset options. When set, every page is
            rendered to an image and the images are saved as a new PDF; the
            render zoom starts at 1.0 and is lowered in steps of 0.1 until
            the output is no larger than *target_kb* or a zoom of 0.3 has
            been tried.

    The last file written stays at *output_path* even when the target size
    was not reached.
    """
    # Presets
    preset_options = {
        "low": {"garbage": 3, "deflate": True, "clean": True},
        "medium": {"garbage": 2, "deflate": True, "clean": True},
        "high": {"garbage": 1, "deflate": True, "clean": True},
    }
    opts = preset_options.get(preset, preset_options["medium"])

    # Context managers close both documents even if rendering or saving fails.
    with fitz.open(input_path) as doc:
        if not target_kb:
            doc.save(output_path, **opts)
            return

        # Iterate the zoom in integer tenths (1.0 down to 0.3). Repeatedly
        # subtracting 0.1 from a float drifts (0.30000000000000016 > 0.3),
        # which would sneak in an extra pass below the intended 0.3 floor.
        for tenths in range(10, 2, -1):
            zoom = tenths / 10
            with fitz.open() as new_doc:
                for page in doc:
                    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                    rect = fitz.Rect(0, 0, pix.width, pix.height)
                    new_page = new_doc.new_page(width=pix.width, height=pix.height)
                    new_page.insert_image(rect, pixmap=pix)
                new_doc.save(output_path, **opts)

            size_kb = os.path.getsize(output_path) / 1024
            if size_kb <= target_kb:
                break
def compress():
    """Compress an uploaded PDF and report the name of the result file.

    On POST, expects the upload in the ``pdf`` field plus optional ``preset``
    (default ``"medium"``) and ``target_kb`` form values. The upload is saved
    under ``UPLOAD_FOLDER``, compressed into ``OUTPUT_FOLDER`` as
    ``compressed_<name>`` via ``compress_pdf``, and a JSON object is
    returned: ``{"success": True, "filename": ...}`` on success or
    ``{"success": False, "error": ...}`` on failure. For any other method,
    renders ``compress.html``.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    if request.method != 'POST':
        return render_template('compress.html')

    if "pdf" not in request.files:
        return jsonify({"success": False, "error": "No file uploaded"})
    file = request.files["pdf"]

    # The filename is client-controlled. Keep only its final path component so
    # that values such as "../../x" or "/etc/x" cannot escape the upload and
    # output folders when joined below.
    safe_name = os.path.basename((file.filename or "").replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        return jsonify({"success": False, "error": "Invalid file name"})

    preset = request.form.get("preset", "medium")

    # A missing, non-numeric or non-positive target means "no size target".
    try:
        target_kb = int(request.form.get("target_kb") or 0)
    except (TypeError, ValueError):
        target_kb = None
    if not target_kb or target_kb < 0:
        target_kb = None

    output_name = "compressed_" + safe_name
    input_path = os.path.join(UPLOAD_FOLDER, safe_name)
    output_path = os.path.join(OUTPUT_FOLDER, output_name)
    file.save(input_path)
    try:
        compress_pdf(input_path, output_path, preset, target_kb)
        return jsonify({"success": True, "filename": output_name})
    except Exception as e:
        return jsonify({"success": False, "error": str(e)})
    finally:
        # The uploaded original is not needed once compression has run;
        # remove it so uploads do not pile up in the scratch folder.
        try:
            os.remove(input_path)
        except OSError:
            pass
def download(filename):
    """Send the file named *filename* from ``OUTPUT_FOLDER`` as an attachment.

    *filename* comes from the caller, so it is resolved with
    ``send_from_directory``, which refuses paths that would leave
    ``OUTPUT_FOLDER`` and answers with a 404 for those and for missing files,
    instead of joining the value into a path by hand.

    NOTE(review): no route decorator is visible on this function in the file
    as seen here; confirm how it is registered with ``app``.
    """
    # Imported locally because the module-level flask import does not
    # include this helper.
    from flask import send_from_directory

    return send_from_directory(OUTPUT_FOLDER, filename, as_attachment=True)
# Start Flask's built-in server only when this file is executed directly,
# listening on every interface on port 7860.
if __name__ == '__main__':
    app.run(port=7860, host='0.0.0.0')