Oh Gyuhyeok committed on
Commit
59e8fda
•
1 Parent(s): 360c8b1

Implement PDF Upload (#2)

Browse files

* Implement PDF Upload

1. Remove line breaks
2. Translate pdfs

easyword_translator/pdf.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pymupdf # PyMuPDF
2
+
3
+
4
def read_pdf(file_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages, in page order, joined without separators.
    """
    document = pymupdf.open(file_path)
    try:
        # Join once instead of growing a string page by page.
        return "".join(page.get_text() for page in document)
    finally:
        # Release the file handle even if text extraction fails midway.
        document.close()
19
+
20
+
21
def remove_line_breaks(text):
    """Replace soft line breaks with spaces while keeping paragraph breaks.

    A newline is turned into a single space only when it is not the first
    or last character, does not follow a period (sentence end), and is not
    part of a run of consecutive newlines (paragraph break).

    Args:
        text: The text to clean up.

    Returns:
        A string of the same length with soft line breaks replaced by
        spaces.
    """
    chars = list(text)
    # The first and last characters are never rewritten, so only interior
    # positions need to be inspected.
    for i in range(1, len(text) - 1):
        if text[i] != "\n":
            continue
        # Keep breaks that end a sentence or belong to a blank-line
        # paragraph separator (either side of the "\n\n" run).
        if text[i - 1] in ".\n" or text[i + 1] == "\n":
            continue
        chars[i] = " "
    # Single join instead of re-slicing the whole string per replacement.
    return "".join(chars)
30
+
31
+
32
if __name__ == "__main__":
    # Manual smoke run: extract the sample PDF and print its text with
    # soft line breaks removed.
    print(remove_line_breaks(read_pdf("example3.pdf")))
easyword_translator/run.py CHANGED
@@ -9,6 +9,8 @@ from langchain_core.prompts import ChatPromptTemplate
9
  from langchain_upstage import ChatUpstage
10
  from rapidfuzz import process
11
 
 
 
12
  warnings.filterwarnings("ignore")
13
 
14
 
@@ -125,15 +127,102 @@ def translate(sentence: str) -> str:
125
  f"์ „๋ฌธ์šฉ์–ด๋ฅผ ๋ฒˆ์—ญํ–ˆ์œผ๋ฉด ๋ฐ˜๋“œ์‹œ ์›์–ด๋ฅผ ๊ด„ํ˜ธ[]์— ๋„ฃ์–ด์„œ ๋”ฐ๋ผ ๋ถ™์—ฌ์•ผ ํ•ด. '์‹คํ–‰ํ๋ฆ„[control]'์ฒ˜๋Ÿผ. ๋ฐฉ๊ธˆ ๋ฒˆ์—ญํ•œ '{refined_translation}'์—์„œ, ์›๋ž˜ ๋ฌธ์žฅ '{sentence}'์— ์‚ฌ์šฉ๋œ ์›์–ด๋ฅผ ์šฉ์–ด ๋ฐ”๋กœ ๋’ค์— ๊ด„ํ˜ธ []์— ๋„ฃ์–ด์„œ ๋”ฐ๋ผ ๋ถ™์—ฌ์ค˜.",
126
  ),
127
  ]
128
- refined_translation = chainer(messages).invoke({})
 
 
 
 
 
129
  logger.info(refined_translation)
130
 
131
  refined_translation = refined_translation.replace("[", "(").replace("]", ")")
132
  return refined_translation
133
 
134
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  with gr.Blocks() as demo:
136
- with gr.Tab("CHAT"):
137
  chatbot = gr.Interface(
138
  fn=translate,
139
  inputs=gr.Textbox(label="Enter your text"),
@@ -150,6 +239,24 @@ with gr.Blocks() as demo:
150
  title=TITLE,
151
  description=DESCRIPTION,
152
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
154
 
155
  def main():
 
9
  from langchain_upstage import ChatUpstage
10
  from rapidfuzz import process
11
 
12
+ import pymupdf
13
+
14
  warnings.filterwarnings("ignore")
15
 
16
 
 
127
  f"์ „๋ฌธ์šฉ์–ด๋ฅผ ๋ฒˆ์—ญํ–ˆ์œผ๋ฉด ๋ฐ˜๋“œ์‹œ ์›์–ด๋ฅผ ๊ด„ํ˜ธ[]์— ๋„ฃ์–ด์„œ ๋”ฐ๋ผ ๋ถ™์—ฌ์•ผ ํ•ด. '์‹คํ–‰ํ๋ฆ„[control]'์ฒ˜๋Ÿผ. ๋ฐฉ๊ธˆ ๋ฒˆ์—ญํ•œ '{refined_translation}'์—์„œ, ์›๋ž˜ ๋ฌธ์žฅ '{sentence}'์— ์‚ฌ์šฉ๋œ ์›์–ด๋ฅผ ์šฉ์–ด ๋ฐ”๋กœ ๋’ค์— ๊ด„ํ˜ธ []์— ๋„ฃ์–ด์„œ ๋”ฐ๋ผ ๋ถ™์—ฌ์ค˜.",
128
  ),
129
  ]
130
+ try:
131
+ refined_translation = chainer(messages).invoke({})
132
+ except Exception as e:
133
+ logger.error(e)
134
+ break
135
+
136
  logger.info(refined_translation)
137
 
138
  refined_translation = refined_translation.replace("[", "(").replace("]", ")")
139
  return refined_translation
140
 
141
 
142
class PDFFile:
    """Queue uploaded PDF paths and translate their text into .txt files."""

    # Number of "."-separated sentences sent to translate() per call.
    SENTENCES_PER_CHUNK = 50

    def __init__(self):
        # Paths queued by upload_file() and consumed by transalte_pdf().
        self.file_list = []

    def read_pdf(self, file_path: str) -> str:
        """Return the concatenated text of every page of the PDF at *file_path*."""
        document = pymupdf.open(file_path)
        try:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in document)
        finally:
            # Release the file handle even if text extraction fails midway.
            document.close()

    def remove_line_breaks(self, text: str) -> str:
        """Replace soft line breaks with spaces while keeping paragraph breaks.

        A newline is turned into a single space only when it is not the
        first or last character, does not follow a period, and is not part
        of a run of consecutive newlines.
        """
        chars = list(text)
        for i in range(1, len(text) - 1):
            if text[i] != "\n":
                continue
            # Keep breaks that end a sentence or belong to a blank-line
            # paragraph separator (either side of the "\n\n" run).
            if text[i - 1] in ".\n" or text[i + 1] == "\n":
                continue
            chars[i] = " "
        return "".join(chars)

    def upload_file(self, file_path: str) -> list[str]:
        """Queue *file_path* for translation and return the current queue."""
        self.file_list.append(file_path)
        return self.file_list

    def transalte_pdf(
        self,
        remove_line_breaks: bool,
        save_before_translation: bool,
    ) -> "str | list[str]":
        """Translate every queued PDF and write the results next to it.

        The method name keeps its original spelling because callers
        reference it.

        Args:
            remove_line_breaks: When true, soft line breaks are removed from
                the extracted text before translation.
            save_before_translation: When true, the extracted (and possibly
                cleaned) text is also written to ``<name>_pre.txt``.

        Returns:
            The list of written file paths, or the message
            ``"No file uploaded yet."`` when the queue is empty.
        """
        if not self.file_list:
            # NOTE(review): callers get a plain message here instead of a
            # list of paths - confirm the consuming UI component handles it.
            return "No file uploaded yet."

        file_out_list = []
        for pdf_path in self.file_list:
            directory = os.path.dirname(pdf_path)
            # splitext keeps the full name when there is no extension.
            stem = os.path.splitext(os.path.basename(pdf_path))[0]

            pdf_text = self.read_pdf(pdf_path)
            if remove_line_breaks:
                pdf_text = self.remove_line_breaks(pdf_text)

            if save_before_translation:
                # os.path.join avoids writing to "/" when directory is "".
                pre_path = os.path.join(directory, f"{stem}_pre.txt")
                with open(pre_path, "w", encoding="utf-8") as f:
                    f.write(pdf_text)
                file_out_list.append(pre_path)

            # Translate in fixed-size batches of sentences so a long
            # document is not sent to translate() in one request.
            sentences = pdf_text.split(".")
            translated_parts = []
            for start in range(0, len(sentences), self.SENTENCES_PER_CHUNK):
                chunk = ".".join(sentences[start : start + self.SENTENCES_PER_CHUNK])
                translated_parts.append(translate(chunk) + ". ")
            translation = "".join(translated_parts)

            # Explicit UTF-8 so translated text does not depend on the
            # platform default encoding.
            out_path = os.path.join(directory, f"{stem}_translated.txt")
            with open(out_path, "w", encoding="utf-8") as f:
                f.write(translation)
            file_out_list.append(out_path)

        # The queue is consumed once every file has been processed.
        self.file_list = []
        return file_out_list
222
+
223
+
224
  with gr.Blocks() as demo:
225
+ with gr.Tab("TEXT"):
226
  chatbot = gr.Interface(
227
  fn=translate,
228
  inputs=gr.Textbox(label="Enter your text"),
 
239
  title=TITLE,
240
  description=DESCRIPTION,
241
  )
242
+ with gr.Tab("PDF"):
243
+ pdf_file = PDFFile()
244
+ upload_button = gr.UploadButton(
245
+ label="Upload PDF",
246
+ file_types=[".pdf"],
247
+ )
248
+ upload_file_list_box = gr.File(label="Uploaded Files")
249
+
250
+ upload_button.upload(pdf_file.upload_file, upload_button, upload_file_list_box)
251
+
252
+ run_translator = gr.Interface(
253
+ fn=pdf_file.transalte_pdf,
254
+ inputs=[
255
+ gr.Checkbox(label="Remove line breaks"),
256
+ gr.Checkbox(label="Save before translation"),
257
+ ],
258
+ outputs=[gr.File(label="Download Translated Files")],
259
+ )
260
 
261
 
262
  def main():