File size: 3,384 Bytes
5e30561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f669dd9
5e30561
 
dd9b2b1
5e30561
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import gradio as gr
import os
import shutil
from pypdf import PdfReader
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import fitz

# Hugging Face repository providing both the tokenizer and the model weights.
TOKENIZER_REPO = "MediaTek-Research/Breeze-7B-Instruct-v1_0"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_REPO,local_files_only=False,use_fast=True)
# Prompt prefix (Simplified Chinese): "Please convert the following text to Traditional Chinese:"
tran_hints = "请将以下的文字转为繁体:"
# Llama-style begin/end special-token markers; not used by the active code paths.
start_flag="<s>"
end_flag="</s>"
# device_map="auto" lets accelerate shard/place the model (GPU when available);
# bfloat16 halves memory versus float32.
model = AutoModelForCausalLM.from_pretrained(
    TOKENIZER_REPO,
    device_map="auto",
    local_files_only=False,
    torch_dtype=torch.bfloat16
)

def generate(text):
    """Run the Breeze chat model on *text* and return the decoded output.

    The input is wrapped as a single-turn user message via the tokenizer's
    chat template. Returns the full decoded sequence (prompt included, with
    special tokens), matching what downstream callers post-process.
    """
    chat_data = []
    text = text.strip()
    if text:
       chat_data.append({"role": "user", "content": text})
    achat = tokenizer.apply_chat_template(chat_data, return_tensors="pt")
    # Bug fix: with device_map="auto" the model may live on a GPU, but
    # generate() does not move inputs automatically — align them explicitly.
    achat = achat.to(model.device)
    # NOTE(review): do_sample defaults to False, so top_p/top_k/temperature are
    # ignored and decoding is greedy; temperature=0 would error if sampling
    # were enabled. Kept as-is to preserve observable behavior.
    outputs = model.generate(achat,
                         max_new_tokens=2048,
                         top_p=0.01,
                         top_k=85,
                         repetition_penalty=1.1,
                         temperature=0)

    return tokenizer.decode(outputs[0])

def tran_txt(input_txt):
  """Translate *input_txt* to Traditional Chinese via the model.

  Prepends the translation instruction prompt, runs generation, and returns
  the raw model output (the earlier post-processing that stripped the prompt
  and <s>/</s> markers was dead commented-out code and has been removed).
  """
  data_txt = tran_hints + "\n" + input_txt.strip()
  tran_result = generate(data_txt)
  # Debug trace of the raw model output.
  print("tran_result=" + tran_result)
  return tran_result

def exec_tran(file):
  """Translate an uploaded PDF and return the path of the result text file.

  Copies the upload into ./data, splits its text into paragraphs, translates
  each paragraph, and writes one translated paragraph per line to
  "<original name minus .pdf>_result.txt".
  """
  temp_file = upload_file(file)
  page_texts = read_paragraphs(temp_file)
  # Derive the output name. Bug fix: the original used str.index(), which
  # raises ValueError when '.pdf' is absent, making its else-branch dead.
  temp_result_file = str(file)
  file_index = temp_result_file.find('.pdf')
  if file_index != -1:
    temp_result_file = temp_result_file[0:file_index]
  temp_result_file = temp_result_file + "_result.txt"
  # NOTE(review): the original also ran tran_txt(tran_hints) once and threw
  # the result away — a wasted full model call, removed here.
  with open(temp_result_file, 'w', encoding='utf-8') as fw:
    for page_content in page_texts:
      fw.write(tran_txt(page_content) + "\n")
  return temp_result_file

def upload_file(file):
  """Copy *file* into the local ./data upload folder and return the new path."""
  UPLOAD_FOLDER = "./data"
  # makedirs(exist_ok=True) is race-free (no check-then-create window) and
  # creates intermediate directories, unlike the original exists()+mkdir().
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
  return shutil.copy(file, UPLOAD_FOLDER)

def read_paragraphs(pdf_path):
    """Extract text from *pdf_path* and split it into non-empty paragraphs.

    Paragraph boundaries are the CJK full stop '。'. Returns a flat list of
    paragraph strings across all pages.
    """
    paragraphs = []
    # 'with' guarantees the document is closed even if extraction raises
    # (the original leaked the handle on error).
    with fitz.open(pdf_path) as document:
        for page in document:
            # Bug fix: "paragraphs" is not a valid get_text() mode in PyMuPDF
            # (it silently falls back to plain text) — request "text" explicitly.
            text = page.get_text("text")
            para_list = text.split('。')
            paragraphs.extend(para for para in para_list if para.strip())
    return paragraphs

def load_pdf_pages(filename):
  """Return a list with the extracted text of every page of *filename*."""
  reader = PdfReader(filename)
  return [page.extract_text() for page in reader.pages]

def exec_translate(file):
  # NOTE(review): appears to be an abandoned draft of exec_tran — it uploads
  # the file and extracts the pages, but discards page_texts and returns
  # None; nothing in this file calls it.
  upload_file(file)
  page_texts=load_pdf_pages(file.name)

# Minimal Gradio UI: one upload button ("Upload PDF file") that runs
# exec_tran on the chosen PDF and exposes the resulting .txt for download.
with gr.Blocks() as app:
   file_output=gr.File()
   upload_button=gr.UploadButton("上传pdf文件",file_types=["pdf"],file_count="single")
   upload_button.upload(exec_tran,upload_button,file_output)

app.launch()