PDFTranslate / pdf2zh /translator.py
sanbo
update sth. at 2024-11-26 16:15:47
9b0f4a0
raw
history blame
12.1 kB
import html
import logging
import os
import re
from json import dumps, loads
import deepl
import ollama
import openai
import requests
from azure.ai.translation.text import TextTranslationClient
from azure.core.credentials import AzureKeyCredential
import hmac
import hashlib
import time
from datetime import datetime,UTC
class BaseTranslator:
    """Common base class for the translation backends in this module.

    It only stores configuration; concrete backends override
    :meth:`translate` to perform the actual work.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Remember the backend configuration.

        Args:
            service: Identifier of the translation service.
            lang_out: Target language code.
            lang_in: Source language code.
            model: Model name (only read by some subclasses).
        """
        self.service, self.lang_out = service, lang_out
        self.lang_in, self.model = lang_in, model

    def translate(self, text) -> str:
        """Placeholder to be overridden; this base version does nothing and returns None."""

    def __str__(self):
        """Return ``"<service> <lang_out> <lang_in>"``."""
        return "{} {} {}".format(self.service, self.lang_out, self.lang_in)
class GoogleTranslator(BaseTranslator):
    """Backend that scrapes the ``translate.google.com/m`` page."""

    def __init__(self, service, lang_out, lang_in, model):
        """Set up the HTTP session; ``"auto"`` maps to zh-CN (target) / en (source)."""
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        self.base_link = "http://translate.google.com/m"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate *text* (truncated to 5000 characters).

        Returns:
            The HTML-unescaped translation, or the literal string
            ``"IRREPARABLE TRANSLATION ERROR"`` when the server answers 400.

        Raises:
            ValueError: If no translation could be found in the page.
        """
        # google translate max length
        query = {"tl": self.lang_out, "sl": self.lang_in, "q": text[:5000]}
        reply = self.session.get(self.base_link, params=query, headers=self.headers)
        # The translation sits in an element whose class is "t0" or
        # "result-container"; capture everything up to the next tag.
        matches = re.findall(
            r'(?s)class="(?:t0|result-container)">(.*?)<', reply.text
        )
        if reply.status_code == 400:
            return "IRREPARABLE TRANSLATION ERROR"
        if not matches:
            raise ValueError("Empty translation result")
        return html.unescape(matches[0])
class TencentTranslator(BaseTranslator):
    """Backend for the Tencent Cloud machine-translation ``TextTranslate`` action.

    Every request is signed with the TC3-HMAC-SHA256 scheme using the
    credentials found in the ``TENCENT_SECRET_ID`` and ``TENCENT_SECRET_KEY``
    environment variables.
    """

    def sign(self, key, msg):
        """Return the raw HMAC-SHA256 digest of the str *msg* under the bytes *key*."""
        return hmac.new(key, msg.encode("utf-8"), hashlib.sha256).digest()

    def __init__(self, service, lang_out, lang_in, model):
        """Read the credentials and prepare the HTTP session.

        ``"auto"`` maps to ``zh`` (target) / ``en`` (source).

        Raises:
            ValueError: If ``TENCENT_SECRET_ID`` or ``TENCENT_SECRET_KEY`` is
                unset or empty.
        """
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # os.getenv returns None for a missing variable instead of raising
        # KeyError, so the credentials are validated explicitly; otherwise the
        # failure would only show up later as a TypeError while signing.
        for env_name in ("TENCENT_SECRET_ID", "TENCENT_SECRET_KEY"):
            if not os.getenv(env_name):
                raise ValueError(
                    f"The environment variable '{env_name}' is required but not set."
                )
        self.secret_id = os.getenv("TENCENT_SECRET_ID")
        self.secret_key = os.getenv("TENCENT_SECRET_KEY")
        self.session = requests.Session()
        self.base_link = "tmt.tencentcloudapi.com"

    def translate(self, text):
        """Translate *text* (truncated to 5000 characters).

        Returns:
            The ``Response.TargetText`` field of the JSON reply, unmodified.

        Raises:
            ValueError: On a non-200 HTTP status, or when the reply does not
                contain ``Response.TargetText``.
        """
        text = text[:5000]
        data = {
            "SourceText": text,
            "Source": self.lang_in,
            "Target": self.lang_out,
            "ProjectId": 0,
        }
        # Serialize once: the very same bytes are hashed for the signature and
        # sent as the request body, so the two can never diverge.
        payload = dumps(data).encode("utf-8")
        hashed_request_payload = hashlib.sha256(payload).hexdigest()

        host = self.base_link
        signed_headers = "content-type;host;x-tc-action"
        canonical_headers = (
            "content-type:application/json; charset=utf-8\n"
            f"host:{host}\n"
            "x-tc-action:texttranslate\n"
        )
        canonical_request = "\n".join(
            [
                "POST",  # HTTP method
                "/",  # canonical URI
                "",  # canonical query string (none)
                canonical_headers,
                signed_headers,
                hashed_request_payload,
            ]
        )

        timestamp = int(time.time())
        date = datetime.fromtimestamp(timestamp, UTC).strftime("%Y-%m-%d")
        credential_scope = date + "/tmt/tc3_request"
        algorithm = "TC3-HMAC-SHA256"
        string_to_sign = "\n".join(
            [
                algorithm,
                str(timestamp),
                credential_scope,
                hashlib.sha256(canonical_request.encode("utf-8")).hexdigest(),
            ]
        )

        # Derive the signing key: secret key -> date -> service -> "tc3_request".
        secret_date = self.sign(("TC3" + self.secret_key).encode("utf-8"), date)
        secret_service = self.sign(secret_date, "tmt")
        secret_signing = self.sign(secret_service, "tc3_request")
        signature = hmac.new(
            secret_signing, string_to_sign.encode("utf-8"), hashlib.sha256
        ).hexdigest()
        authorization = (
            f"{algorithm} Credential={self.secret_id}/{credential_scope}, "
            f"SignedHeaders={signed_headers}, Signature={signature}"
        )

        # Kept local rather than on self: the headers are specific to this
        # request (timestamp + signature) and must not leak into another call.
        headers = {
            "Authorization": authorization,
            "Content-Type": "application/json; charset=utf-8",
            "Host": host,
            "X-TC-Action": "TextTranslate",
            "X-TC-Region": "ap-beijing",
            "X-TC-Timestamp": str(timestamp),
            "X-TC-Version": "2018-03-21",
        }
        response = self.session.post(
            "https://" + host,
            data=payload,
            headers=headers,
        )

        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        result = loads(response.text)

        # 2. Result test
        try:
            return result["Response"]["TargetText"]
        except KeyError as exc:
            raise ValueError("No valid key in Tencent's response") from exc
class DeepLXTranslator(BaseTranslator):
    """Backend that talks to a DeepLX server over HTTP.

    ``DEEPLX_SERVER_URL`` selects the server (default
    ``https://api.deeplx.org``) and the optional ``DEEPLX_AUTH_KEY`` is
    inserted into the request path.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Build the endpoint URL; ``"auto"`` maps to ``zh`` (target) / ``en`` (source)."""
        lang_out = "zh" if lang_out == "auto" else lang_out
        lang_in = "en" if lang_in == "auto" else lang_in
        super().__init__(service, lang_out, lang_in, model)
        # Both variables are optional and os.getenv never raises KeyError, so
        # no exception handling is needed around these lookups.
        auth_key = os.getenv("DEEPLX_AUTH_KEY")
        server_url = os.getenv("DEEPLX_SERVER_URL") or "https://api.deeplx.org"
        server_url = server_url.rstrip("/")
        self.session = requests.Session()
        if auth_key:
            self.base_link = f"{server_url}/{auth_key}/translate"
        else:
            self.base_link = f"{server_url}/translate"
        self.headers = {
            "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"  # noqa: E501
        }

    def translate(self, text):
        """Translate *text* (truncated to 5000 characters).

        Only the target language is sent to the server.

        Returns:
            The ``data`` field of the JSON reply, unmodified.

        Raises:
            ValueError: On a non-200 HTTP status, or when the reply has no
                ``data`` key.
        """
        # Same 5000-character cap that the Google and Tencent backends apply.
        text = text[:5000]
        response = self.session.post(
            self.base_link,
            dumps(
                {
                    "target_lang": self.lang_out,
                    "text": text,
                }
            ),
            headers=self.headers,
        )

        # 1. Status code test
        if response.status_code != 200:
            raise ValueError("HTTP error: " + str(response.status_code))
        result = loads(response.text)

        # 2. Result test
        try:
            return result["data"]
        except KeyError as exc:
            raise ValueError("No valid key in DeepLX's response") from exc
class DeepLTranslator(BaseTranslator):
    """Backend built on the ``deepl`` client library.

    The client is configured from the ``DEEPL_AUTH_KEY`` and
    ``DEEPL_SERVER_URL`` environment variables.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Create the client; ``"auto"`` maps to ``ZH`` (target) / ``EN`` (source)."""
        if lang_out == "auto":
            lang_out = "ZH"
        if lang_in == "auto":
            lang_in = "EN"
        super().__init__(service, lang_out, lang_in, model)
        self.session = requests.Session()
        key = os.getenv("DEEPL_AUTH_KEY")
        url = os.getenv("DEEPL_SERVER_URL")
        self.client = deepl.Translator(key, server_url=url)

    def translate(self, text):
        """Return the ``text`` attribute of the client's translation of *text*."""
        outcome = self.client.translate_text(
            text,
            target_lang=self.lang_out,
            source_lang=self.lang_in,
        )
        return outcome.text
class OllamaTranslator(BaseTranslator):
    """Backend that prompts a chat model through an ``ollama.Client``."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the client; ``"auto"`` maps to ``zh-CN`` (target) / ``en`` (source)."""
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers, hence temperature 0.
        self.options = {"temperature": 0}
        # OLLAMA_HOST
        self.client = ollama.Client()

    def translate(self, text):
        """Ask the model to translate *text* and return its stripped reply."""
        system_msg = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_msg = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        reply = self.client.chat(
            model=self.model,
            options=self.options,
            messages=[system_msg, user_msg],
        )
        content = reply["message"]["content"]
        return content.strip()
class OpenAITranslator(BaseTranslator):
    """Backend that prompts a chat-completion model through ``openai.OpenAI``."""

    def __init__(self, service, lang_out, lang_in, model):
        """Create the client; ``"auto"`` maps to ``zh-CN`` (target) / ``en`` (source)."""
        if lang_out == "auto":
            lang_out = "zh-CN"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)
        # Random sampling may break the formula markers, hence temperature 0.
        self.options = {"temperature": 0}
        # OPENAI_BASE_URL
        # OPENAI_API_KEY
        self.client = openai.OpenAI()

    def translate(self, text) -> str:
        """Ask the model to translate *text* and return the first choice, stripped."""
        system_msg = {
            "role": "system",
            "content": "You are a professional,authentic machine translation engine.",
        }
        user_msg = {
            "role": "user",
            "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",  # noqa: E501
        }
        completion = self.client.chat.completions.create(
            model=self.model,
            **self.options,
            messages=[system_msg, user_msg],
        )
        first_choice = completion.choices[0]
        return first_choice.message.content.strip()
class AzureTranslator(BaseTranslator):
    """Backend built on Azure's ``TextTranslationClient``.

    Requires the ``AZURE_APIKEY``, ``AZURE_ENDPOINT`` and ``AZURE_REGION``
    environment variables.
    """

    def __init__(self, service, lang_out, lang_in, model):
        """Create the client; ``"auto"`` maps to ``zh-Hans`` (target) / ``en`` (source).

        Raises:
            ValueError: If one of the required environment variables is unset.
        """
        if lang_out == "auto":
            lang_out = "zh-Hans"
        if lang_in == "auto":
            lang_in = "en"
        super().__init__(service, lang_out, lang_in, model)

        required = ("AZURE_APIKEY", "AZURE_ENDPOINT", "AZURE_REGION")
        try:
            api_key, endpoint, region = [os.environ[name] for name in required]
        except KeyError as e:
            # The KeyError's first argument is the name of the missing variable.
            missing_var = e.args[0]
            raise ValueError(
                f"The environment variable '{missing_var}' is required but not set."
            ) from e

        self.client = TextTranslationClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(api_key),
            region=region,
        )
        # Raise the HTTP logging policy's logger to WARNING; see
        # https://github.com/Azure/azure-sdk-for-python/issues/9422
        logging.getLogger(
            "azure.core.pipeline.policies.http_logging_policy"
        ).setLevel(logging.WARNING)

    def translate(self, text) -> str:
        """Translate *text* and return the first translation of the first result."""
        results = self.client.translate(
            body=[text],
            from_language=self.lang_in,
            to_language=[self.lang_out],
        )
        return results[0].translations[0].text