Spaces:
Sleeping
Sleeping
from fastapi import FastAPI, UploadFile, File | |
from fastapi.responses import FileResponse | |
from fastapi.staticfiles import StaticFiles | |
import pytesseract | |
from PIL import Image | |
import os | |
# Application object that the handlers and the static mount in this file use.
app = FastAPI()

# Working directories: raw uploads land in one, generated Markdown in the
# other. Both are created at import time so the handlers can write right away.
UPLOAD_DIR = "uploads"
OUTPUT_DIR = "outputs"
for _directory in (UPLOAD_DIR, OUTPUT_DIR):
    os.makedirs(_directory, exist_ok=True)

# OCR language codes mapped to display names. Insertion order matters: the
# keys are joined with "+" to build the language argument passed to the OCR
# call in upload_image.
SUPPORTED_LANGUAGES = dict(
    eng="English",
    rus="Russian",
    msa="Malay",
)
def detect_language(text):
    """Return the language code detected for *text*, or "unknown".

    Args:
        text: Text extracted by OCR.

    Returns:
        The code reported by ``langdetect.detect``, or the string
        ``"unknown"`` when the text is blank, is not a string, or detection
        fails for any reason. This function never raises for bad input.
    """
    # Blank OCR output (or a non-string) carries nothing to detect; answer
    # directly instead of relying on the detector raising for it.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    from langdetect import detect, DetectorFactory

    DetectorFactory.seed = 0  # Ensures consistent results
    try:
        return detect(text)
    except Exception:
        # Deliberate best effort: a detection failure must not abort the
        # caller's OCR request, so it is reported as "unknown".
        return "unknown"
# NOTE(review): no route decorator is visible on this handler in this view;
# confirm it is registered with the app (e.g. by an @app.post(...) line).
async def upload_image(file: UploadFile = File(...)):
    """Run OCR on an uploaded image and save the text as a Markdown file.

    The upload is stored in UPLOAD_DIR, OCR is run with every language in
    SUPPORTED_LANGUAGES, the language of the extracted text is detected, and
    the result is written to OUTPUT_DIR as ``<name>.md``.

    Args:
        file: The uploaded image.

    Returns:
        ``{"download_url": "/download/<name>.md"}`` on success, or
        ``{"error": <message>}`` when the filename is unusable or when OCR or
        writing the result fails.
    """
    # The client controls file.filename. Keep only its final path component
    # so a name such as "../../app.py" cannot escape UPLOAD_DIR or OUTPUT_DIR.
    safe_name = os.path.basename(file.filename or "")
    if safe_name in ("", ".", ".."):
        return {"error": "Invalid filename"}

    file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(file_path, "wb") as f:
        f.write(await file.read())

    markdown_name = f"{os.path.splitext(safe_name)[0]}.md"
    markdown_path = os.path.join(OUTPUT_DIR, markdown_name)
    try:
        # Perform OCR with all supported languages; the context manager
        # closes the image file handle as soon as the text is extracted.
        with Image.open(file_path) as image:
            text = pytesseract.image_to_string(
                image, lang="+".join(SUPPORTED_LANGUAGES)
            )

        # Detect language of the extracted text
        detected_lang = detect_language(text)

        # Save OCR result as Markdown
        with open(markdown_path, "w", encoding="utf-8") as md_file:
            md_file.write(f"# Detected Language: {detected_lang}\n\n{text}")
    except Exception as e:
        # Failures are reported in the response body instead of being raised.
        return {"error": str(e)}

    return {"download_url": f"/download/{markdown_name}"}
# NOTE(review): no route decorator is visible on this handler in this view;
# upload_image returns "/download/<name>" URLs, so presumably this serves
# that path. Confirm the registration.
async def download_file(filename: str):
    """Return a generated Markdown file from OUTPUT_DIR as a download.

    Args:
        filename: Bare name of a file inside OUTPUT_DIR.

    Returns:
        A ``FileResponse`` with media type ``text/markdown`` when the file
        exists, otherwise ``{"error": "File not found"}``. Names that contain
        path components, or that are empty, ".", or "..", are treated as not
        found.
    """
    # Only bare file names are accepted, so the request cannot reach outside
    # OUTPUT_DIR (e.g. "../uploads/x") or name a directory such as "..".
    is_bare_name = os.path.basename(filename) == filename
    if not filename or filename in (".", "..") or not is_bare_name:
        return {"error": "File not found"}

    file_path = os.path.join(OUTPUT_DIR, filename)
    # isfile rather than exists: a directory must not be passed to FileResponse.
    if os.path.isfile(file_path):
        return FileResponse(file_path, media_type='text/markdown', filename=filename)
    return {"error": "File not found"}
# Static front-end (HTML pages and assets) served from the "static"
# directory and mounted at the site root.
_static_files = StaticFiles(directory="static", html=True)
app.mount("/", _static_files, name="static")