Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -11,66 +11,6 @@ import pyclip
|
|
11 |
import pytesseract
|
12 |
from nltk.tokenize import sent_tokenize
|
13 |
from transformers import MarianMTModel, MarianTokenizer
|
14 |
-
# Newly added below
|
15 |
-
from fastapi import FastAPI, File, UploadFile, Body, Depends, HTTPException
|
16 |
-
from fastapi.security.api_key import APIKeyHeader
|
17 |
-
from typing import Optional
|
18 |
-
from fastapi.encoders import jsonable_encoder
|
19 |
-
|
20 |
-
# Shared secret that clients must present; None when the API_KEY
# environment variable is not set.
API_KEY = os.getenv("API_KEY")

# Application object that the route decorators in this file attach to.
app = FastAPI()

# Reads the "api_key" request header. auto_error=False leaves the decision
# about a missing header to get_api_key, which checks for None itself.
api_key_header = APIKeyHeader(auto_error=False, name="api_key")
|
24 |
-
|
25 |
-
def get_api_key(api_key: Optional[str] = Depends(api_key_header)):
    """Validate the ``api_key`` request header against the configured secret.

    Args:
        api_key: Value supplied by the ``api_key_header`` dependency, or
            ``None`` when the header is absent.

    Returns:
        The supplied key, unchanged, when it matches ``API_KEY``.

    Raises:
        HTTPException: 401 when the header is missing, when it does not
            match, or when no ``API_KEY`` is configured on the server.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    # Fail closed when either side is missing; compare_digest needs two values.
    if api_key is None or API_KEY is None:
        raise HTTPException(status_code=401, detail="Unauthorized access")

    # Constant-time comparison so response timing does not reveal how much of
    # the secret a guess got right. Encoding to bytes allows non-ASCII input.
    supplied = api_key.encode("utf-8")
    expected = API_KEY.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized access")

    return api_key
|
29 |
-
|
30 |
-
@app.post("/ocr", response_model=dict)
async def ocr(
    api_key: str = Depends(get_api_key),
    image: UploadFile = File(...),
    languages: list = Body(["eng"]),
):
    """Run Tesseract OCR over an uploaded image and return the text.

    Args:
        api_key: Injected by ``get_api_key``; the request is rejected with
            401 before this handler runs if the key is invalid.
        image: Uploaded image file.
        languages: Tesseract language codes; joined with ``+`` to form the
            ``lang`` argument.

    Returns:
        ``{"text": <recognized text>}``.

    Raises:
        HTTPException: 500 with the underlying error message when OCR fails.
    """
    try:
        # NOTE(review): the raw uploaded bytes are passed straight to
        # Tesseract. pytesseract is documented as taking a PIL image, array
        # or file path - confirm bytes are accepted or decode them first.
        text = pytesseract.image_to_string(
            await image.read(), lang="+".join(languages)
        )
    except Exception as e:
        # Request boundary: report the failure as a real HTTP 500. Returning
        # a ``(body, status)`` tuple is a Flask idiom; here it would just be
        # a tuple that does not match ``response_model=dict``.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return jsonable_encoder({"text": text})
|
45 |
-
|
46 |
-
|
47 |
-
@app.post("/translate", response_model=dict)
async def translate(
    api_key: str = Depends(get_api_key),
    text: str = Body(...),
    src: str = "en",
    trg: str = "zh",
):
    """Translate ``text`` sentence by sentence with a MarianMT model.

    Args:
        api_key: Injected by ``get_api_key``; invalid keys are rejected
            before this handler runs.
        text: Text to translate, taken from the request body.
        src: Source language code passed to ``get_model``.
        trg: Target language code passed to ``get_model``.

    Returns:
        ``{"translated_text": ...}`` where each translated sentence is
        followed by a newline.
    """
    tokenizer, model = get_model(src, trg)

    # Translate one sentence at a time and collect the pieces.
    pieces = []
    for sentence in sent_tokenize(text):
        encoded = tokenizer(sentence, return_tensors="pt")
        # generate() returns a batch; only one sentence was submitted.
        output_ids = model.generate(**encoded)[0]
        decoded = tokenizer.decode(output_ids, skip_special_tokens=True)
        pieces.append(decoded + "\n")

    return jsonable_encoder({"translated_text": "".join(pieces)})
|
65 |
-
|
66 |
-
|
67 |
-
from functools import lru_cache


# Loading a tokenizer and model is expensive, so keep recently used language
# pairs in memory. The cache is bounded because each pair holds a full model.
@lru_cache(maxsize=4)
def get_model(src: str, trg: str):
    """Load the Helsinki-NLP MarianMT tokenizer and model for a language pair.

    Args:
        src: Source language code, used in the model name.
        trg: Target language code, used in the model name.

    Returns:
        A ``(tokenizer, model)`` tuple for ``Helsinki-NLP/opus-mt-{src}-{trg}``.
        Repeated calls with the same pair return the same cached objects.
        Failed loads are not cached, so a later call retries.
    """
    model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    return tokenizer, model
|
72 |
-
|
73 |
-
# ===============================================
|
74 |
|
75 |
# Fetch the NLTK "punkt" resource when the module is imported.
# NOTE(review): presumably needed by sent_tokenize - confirm.
nltk.download('punkt')
|
76 |
|
@@ -113,6 +53,7 @@ def ocr_lang(lang_list):
|
|
113 |
# ocr tesseract
|
114 |
def ocr_tesseract(img, languages):
    """Recognize text in ``img`` with Tesseract.

    Args:
        img: Image passed through unchanged to ``pytesseract.image_to_string``.
        languages: Language selection, converted by ``ocr_lang`` into the
            ``lang`` argument.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Debug trace of the incoming image.
    print("[img]", img)
    tesseract_lang = ocr_lang(languages)
    return pytesseract.image_to_string(img, lang=tesseract_lang)
|
118 |
|
|
|
11 |
import pytesseract
|
12 |
from nltk.tokenize import sent_tokenize
|
13 |
from transformers import MarianMTModel, MarianTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Fetch the NLTK "punkt" resource when the module is imported.
# NOTE(review): presumably needed by sent_tokenize - confirm.
nltk.download('punkt')
|
16 |
|
|
|
53 |
# ocr tesseract
|
54 |
def ocr_tesseract(img, languages):
    """Recognize text in ``img`` with Tesseract.

    Args:
        img: Image passed through unchanged to ``pytesseract.image_to_string``.
        languages: Language selection, converted by ``ocr_lang`` into the
            ``lang`` argument.

    Returns:
        The string returned by ``pytesseract.image_to_string``.
    """
    # Debug traces of both inputs.
    print("[img]", img)
    print("[languages]", languages)
    tesseract_lang = ocr_lang(languages)
    return pytesseract.image_to_string(img, lang=tesseract_lang)
|
59 |
|