Samarth991 committed on
Commit e963fa4
1 Parent(s): 136eadc

adding app

Files changed (3)
  1. app.py +192 -0
  2. read_photodocument.py +381 -0
  3. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,192 @@
+ import os
+ import gradio as gr
+ import re
+ from langchain.vectorstores import FAISS
+ from langchain.embeddings.base import Embeddings
+ from typing import List
+ from sentence_transformers import SentenceTransformer
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain.prompts import PromptTemplate
+ from langchain_community.llms.huggingface_hub import HuggingFaceHub
+ from read_photodocument import convert_PDF_to_Text
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ import contextlib
+ from langchain.schema import Document
+ from langchain.text_splitter import CharacterTextSplitter
+ from langchain.chains.summarize import load_summarize_chain
+ import logging
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(levelname)s %(message)s",
+     datefmt="%m/%d/%Y %I:%M:%S",
+ )
+
+ DEVICE = 'cpu'
+ FILE_EXT = ['pdf', 'jpg', 'jpeg']
+ DEFAULT_SYSTEM_PROMPT = "As an intelligent AI, your task is to extract text from the image-based PDF, create a summary, and highlight the vital points within it."
+
+ MAX_NEW_TOKENS = 2048
+ DEFAULT_TEMPERATURE = 0.1
+ DEFAULT_MAX_NEW_TOKENS = 1024
+ MAX_INPUT_TOKEN_LENGTH = 2048
+
+ embedding_modelPath = 'multi-qa-mpnet-base-dot-v1'  # alternative: "sentence-transformers/all-MiniLM-l6-v2"
+ local_embeddings = HuggingFaceEmbeddings(model_name=embedding_modelPath, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False})
+
+
+ with contextlib.redirect_stdout(None):
+     ocr_model = ocr_predictor(
+         "db_resnet50",
+         "crnn_mobilenet_v3_large",
+         pretrained=True,
+         assume_straight_pages=True,
+     )
+
+ def loading_file():
+     return "Loading..."
+
+
+ def summarize_data(docs, llm_model, chain_type='refine'):
+     prompt_template = """
+     Write a concise summary of the following pointwise and avoid repetition:
+     {text}
+     CONCISE SUMMARY:
+     """
+     refine_template = (
+         "Your job is to produce a final summary in points.\n"
+         "Existing summary up to a certain point: {existing_answer}\n"
+         "Write the details of the summary pointwise and avoid repetition."
+     )
+
+     prompt = PromptTemplate.from_template(prompt_template)
+     refine_prompt = PromptTemplate.from_template(refine_template)
+
+     chain = load_summarize_chain(llm=llm_model,
+                                  chain_type=chain_type,
+                                  # question_prompt=prompt,
+                                  # refine_prompt=refine_prompt,
+                                  return_intermediate_steps=False,
+                                  input_key="input_documents",
+                                  output_key="output_text",
+                                  )
+     summary = chain({"input_documents": docs}, return_only_outputs=True)
+     output_text = summary["output_text"].strip()
+     regex = r"CONCISE SUMMARY:(.*)"
+
+     # fall back to the raw summary if the "CONCISE SUMMARY:" marker is absent
+     lines = output_text.split("\n")
+     matches = re.finditer(regex, output_text, re.DOTALL)
+     for matchNum, match in enumerate(matches, start=1):
+         for groupNum in range(0, len(match.groups())):
+             groupNum = groupNum + 1
+             lines = match.group(groupNum).strip().split("\n")
+     return lines
+
+
+ def process_documents(texts, data_chunk=1000, chunk_overlap=10):
+     text_splitter = CharacterTextSplitter(
+         separator="\n",
+         chunk_size=data_chunk,
+         chunk_overlap=chunk_overlap,
+         length_function=len
+     )
+
+     texts = text_splitter.split_text(texts)
+     docs = [Document(page_content=txt) for txt in texts]
+     return docs
+
+ def get_hugging_face_model(model_id='tiiuae/falcon-7b-instruct', temperature=0.01, max_tokens=4096, API_key=None):
+     llm = HuggingFaceHub(
+         huggingfacehub_api_token=API_key,
+         repo_id=model_id,
+         model_kwargs={"temperature": temperature, "max_new_tokens": max_tokens}
+     )
+     return llm
+
+
+ def document_loader(temperature, max_tokens, api_key, model_name, file_path):
+     model = get_hugging_face_model(model_id=model_name, API_key=api_key, temperature=temperature, max_tokens=max_tokens)
+     # Gradio's file input may pass either a filepath string or a file wrapper with a .name attribute
+     file_path = getattr(file_path, "name", file_path)
+     converted_txt = None
+     if file_path.endswith('.pdf'):
+         conversion_stats = convert_PDF_to_Text(PDF_file=file_path, ocr_model=ocr_model)
+         converted_txt = conversion_stats["converted_text"]
+         num_pages = conversion_stats["num_pages"]
+         was_truncated = conversion_stats["truncated"]
+         print("Converted text {}\nNum Pages: {}".format(converted_txt, num_pages))
+
+     if converted_txt:
+         print("Document Processed ..")
+         texts = process_documents(texts=converted_txt)
+         lines = summarize_data(docs=texts, llm_model=model)
+         return lines
+     else:
+         return "Error in processing document"
+
+
+
+ iface = gr.Interface(
+     fn=document_loader,
+     inputs=[
+         gr.Slider(0.01, 0.1, value=0.01, step=0.01, label="temperature", info="Choose between 0.01 and 0.1"),
+         gr.Slider(512, MAX_INPUT_TOKEN_LENGTH, value=1024, step=512, label="max new tokens", info='Max new tokens'),
+         gr.Textbox(label="Add API key", type="password"),
+         gr.Dropdown(['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'], label='Large Language Model', info='LLM Service'),
+         "file"
+     ],
+     outputs="text",
+     description="Summarize your PDF Document having Image • HuggingFace",
+ )
+
+ iface.launch()
+
+ # with gr.Blocks(css=css) as demo:
+ #     with gr.Column(elem_id="col-container"):
+ #         gr.HTML(title)
+
+ #         with gr.Group():
+ #             chatbot = gr.Chatbot(height=300)
+ #             with gr.Row():
+ #                 sumarize_btn = gr.Button(value="Summarize", variant="primary", scale=1)
+ #                 clean_chat_btn = gr.Button("Delete Chat")
+
+ #         with gr.Column():
+ #             LLM_option = gr.Dropdown(['tiiuae/falcon-7b-instruct', 'mistralai/Mistral-7B-v0.1'], label='Large Language Model Selection', info='LLM Service')
+
+ #         with gr.Column():
+ #             with gr.Box():
+ #                 file_extension = gr.Dropdown(FILE_EXT, label="File Extensions", info="Select type of file to upload !")
+ #                 pdf_doc = gr.File(label="Upload File", file_types=FILE_EXT, type="file")
+ #                 with gr.Accordion(label='Advanced options', open=False):
+ #                     max_new_tokens = gr.Slider(
+ #                         label='Max new tokens',
+ #                         minimum=512,
+ #                         maximum=MAX_NEW_TOKENS,
+ #                         step=1024,
+ #                         value=DEFAULT_MAX_NEW_TOKENS,
+ #                     )
+ #                     temperature = gr.Slider(
+ #                         label='Temperature',
+ #                         minimum=0.01,
+ #                         maximum=1.0,
+ #                         step=0.05,
+ #                         value=DEFAULT_TEMPERATURE,
+ #                     )
+ #         with gr.Row():
+ #             langchain_status = gr.Textbox(label="Status", placeholder="", interactive=False)
+ #             load_pdf = gr.Button("Upload File & Generate Embeddings").style(full_width=False)
+
+ #     # chatbot = gr.Chatbot()
+ #     # question = gr.Textbox(label="Question", placeholder="Type your question and hit Enter")
+ #     # submit_button = gr.Button("Send Message")
+
+ #     if pdf_doc:
+ #         load_pdf.click(loading_file, None, langchain_status, queue=False)
+ #         load_pdf.click(document_loader, inputs=[pdf_doc, file_extension, temperature, max_new_tokens], outputs=[langchain_status], queue=False)
+
+ #         # question.submit(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
+ #         # submit_btn.click(add_text, inputs=[chatbot, question], outputs=[chatbot, question]).then(bot, chatbot, chatbot)
+ #         sumarize_btn.click()
+ #         # submit_btn.then(chatf.highlight_found_text, [chatbot, sources], [sources])
+ #         clean_chat_btn.click(clear_chat, [], chatbot)
+
+
+ # demo.launch()
read_photodocument.py ADDED
@@ -0,0 +1,381 @@
+ import logging
+ from pathlib import Path
+
+ import os
+ import pprint as pp
+ import re
+ import shutil
+ import time
+ from datetime import date, datetime
+ from os.path import basename, dirname, join
+ from pathlib import Path
+
+ from cleantext import clean
+ from doctr.io import DocumentFile
+ from doctr.models import ocr_predictor
+ from libretranslatepy import LibreTranslateAPI
+ from natsort import natsorted
+ from spellchecker import SpellChecker
+ from tqdm.auto import tqdm
+ import nltk
+ import contextlib
+ nltk.download("stopwords")  # TODO: find where this requirement originates from
+
+
+ def simple_rename(filepath, target_ext=".txt"):
+     _fp = Path(filepath)
+     basename = _fp.stem
+     return f"OCR_{basename}_{target_ext}"
+
+
+ def rm_local_text_files(name_contains="RESULT_"):
+     """
+     rm_local_text_files - remove local text files
+     Args:
+         name_contains (str, optional): only remove files whose name contains this substring. Defaults to "RESULT_".
+     """
+     files = [
+         f
+         for f in Path.cwd().iterdir()
+         if f.is_file() and f.suffix == ".txt" and name_contains in f.name
+     ]
+     logging.info(f"removing {len(files)} text files")
+     for f in files:
+         os.remove(f)
+     logging.info("done")
+
+
+ def corr(
+     s: str,
+     add_space_when_numerics=False,
+     exceptions=["e.g.", "i.e.", "etc.", "cf.", "vs.", "p."],
+ ) -> str:
+     """corrects spacing in a string
+     Args:
+         s (str): the string to correct
+         add_space_when_numerics (bool, optional): [add a space when a period is between two numbers, example 5.73]. Defaults to False.
+         exceptions (list, optional): [do not change these substrings]. Defaults to ['e.g.', 'i.e.', 'etc.', 'cf.', 'vs.', 'p.'].
+     Returns:
+         str: the corrected string
+     """
+     if add_space_when_numerics:
+         s = re.sub(r"(\d)\.(\d)", r"\1. \2", s)
+
+     s = re.sub(r"\s+", " ", s)
+     s = re.sub(r'\s([?.!"](?:\s|$))', r"\1", s)
+
+     # fix space before apostrophe
+     s = re.sub(r"\s\'", r"'", s)
+     # fix space after apostrophe
+     s = re.sub(r"'\s", r"'", s)
+     # fix space before comma
+     s = re.sub(r"\s,", r",", s)
+
+     for e in exceptions:
+         expected_sub = re.sub(r"\s", "", e)
+         s = s.replace(expected_sub, e)
+
+     return s
+
+
+ def fix_punct_spaces(string):
+     """
+     fix_punct_spaces - replace spaces around punctuation with punctuation. For example, "hello , there" -> "hello, there"
+     Parameters
+     ----------
+     string : str, required, input string to be corrected
+     Returns
+     -------
+     str, corrected string
+     """
+
+     fix_spaces = re.compile(r"\s*([?!.,]+(?:\s+[?!.,]+)*)\s*")
+     string = fix_spaces.sub(lambda x: "{} ".format(x.group(1).replace(" ", "")), string)
+     string = string.replace(" ' ", "'")
+     string = string.replace(' " ', '"')
+     return string.strip()
+
+
+ def clean_OCR(ugly_text: str):
+     """
+     clean_OCR - clean the OCR text files.
+     Parameters
+     ----------
+     ugly_text : str, required, input string to be cleaned
+     Returns
+     -------
+     str, cleaned string
+     """
+     # Remove all the newlines.
+     cleaned_text = ugly_text.replace("\n", " ")
+     # Remove all the tabs.
+     cleaned_text = cleaned_text.replace("\t", " ")
+     # Remove all the double spaces.
+     cleaned_text = cleaned_text.replace("  ", " ")
+     # Remove all the spaces at the beginning of the text.
+     cleaned_text = cleaned_text.lstrip()
+     # remove all instances of "- " and " - "
+     cleaned_text = cleaned_text.replace("- ", "")
+     cleaned_text = cleaned_text.replace(" -", "")
+     return fix_punct_spaces(cleaned_text)
+
+
+ def move2completed(from_dir, filename, new_folder="completed", verbose=False):
+
+     # this is the better version
+     old_filepath = join(from_dir, filename)
+
+     new_filedirectory = join(from_dir, new_folder)
+
+     if not os.path.isdir(new_filedirectory):
+         os.mkdir(new_filedirectory)
+         if verbose:
+             print("created new directory for files at: \n", new_filedirectory)
+     new_filepath = join(new_filedirectory, filename)
+
+     try:
+         shutil.move(old_filepath, new_filepath)
+         logging.info("successfully moved the file {} to */completed.".format(filename))
+     except:
+         logging.info(
+             "ERROR! unable to move file to \n{}. Please investigate".format(
+                 new_filepath
+             )
+         )
+
+
+ """## pdf2text functions
+ """
+
+
+ custom_replace_list = {
+     "t0": "to",
+     "'$": "'s",
+     ",,": ", ",
+     "_ ": " ",
+     " '": "'",
+ }
+
+ replace_corr_exceptions = {
+     "i. e.": "i.e.",
+     "e. g.": "e.g.",
+     "e. g": "e.g.",
+     " ,": ",",
+ }
+
+
+ spell = SpellChecker()
+
+
+ def check_word_spelling(word: str) -> bool:
+     """
+     check_word_spelling - check the spelling of a word
+     Args:
+         word (str): word to check
+     Returns:
+         bool: True if word is spelled correctly, False if not
+     """
+
+     misspelled = spell.unknown([word])
+
+     return len(misspelled) == 0
+
+
+ def eval_and_replace(text: str, match_token: str = "- ") -> str:
+     """
+     eval_and_replace - conditionally replace all instances of a substring in a string based on whether the eliminated substring results in a valid word
+     Args:
+         text (str): text to evaluate
+         match_token (str, optional): token to replace. Defaults to "- ".
+     Returns:
+         str: text with replaced tokens
+     """
+
+     try:
+         if match_token not in text:
+             return text
+         else:
+             while True:
+                 full_before_text = text.split(match_token, maxsplit=1)[0]
+                 before_text = [
+                     char for char in full_before_text.split()[-1] if char.isalpha()
+                 ]
+                 before_text = "".join(before_text)
+                 full_after_text = text.split(match_token, maxsplit=1)[-1]
+                 after_text = [char for char in full_after_text.split()[0] if char.isalpha()]
+                 after_text = "".join(after_text)
+                 full_text = before_text + after_text
+                 if check_word_spelling(full_text):
+                     text = full_before_text + full_after_text
+                 else:
+                     text = full_before_text + " " + full_after_text
+                 if match_token not in text:
+                     break
+     except Exception as e:
+         logging.error(f"Error spell-checking OCR output, returning default text:\t{e}")
+     return text
+
+
+ def cleantxt_ocr(ugly_text, lower=False, lang: str = "en") -> str:
+     """
+     cleantxt_ocr - clean text from OCR
+     Args:
+         ugly_text (str): text to clean
+         lower (bool, optional): lowercase the text. Defaults to False.
+         lang (str, optional): language hint for clean-text. Defaults to "en".
+     Returns:
+         str: cleaned text
+     """
+     # a wrapper for clean text with options different than default
+
+     # https://pypi.org/project/clean-text/
+     cleaned_text = clean(
+         ugly_text,
+         fix_unicode=True,  # fix various unicode errors
+         to_ascii=True,  # transliterate to closest ASCII representation
+         lower=lower,  # lowercase text
+         no_line_breaks=True,  # fully strip line breaks as opposed to only normalizing them
+         no_urls=True,  # replace all URLs with a special token
+         no_emails=False,  # replace all email addresses with a special token
+         no_phone_numbers=False,  # replace all phone numbers with a special token
+         no_numbers=False,  # replace all numbers with a special token
+         no_digits=False,  # replace all digits with a special token
+         no_currency_symbols=False,  # replace all currency symbols with a special token
+         no_punct=False,  # remove punctuation
+         replace_with_punct="",  # instead of removing punctuation you may replace it
+         replace_with_url="<URL>",
+         replace_with_email="<EMAIL>",
+         replace_with_phone_number="<PHONE>",
+         replace_with_number="<NUM>",
+         replace_with_digit="0",
+         replace_with_currency_symbol="<CUR>",
+         lang=lang,  # set to 'de' for German special handling
+     )
+
+     return cleaned_text
+
+
+ def format_ocr_out(OCR_data):
+
+     if isinstance(OCR_data, list):
+         text = " ".join(OCR_data)
+     else:
+         text = str(OCR_data)
+     _clean = cleantxt_ocr(text)
+     return corr(_clean)
+
+
+ def postprocess(text: str) -> str:
+     """to be used after recombining the lines"""
+
+     proc = corr(cleantxt_ocr(text))
+
+     for k, v in custom_replace_list.items():
+         proc = proc.replace(str(k), str(v))
+
+     proc = corr(proc)
+
+     for k, v in replace_corr_exceptions.items():
+         proc = proc.replace(str(k), str(v))
+
+     return eval_and_replace(proc)
+
+
+ def result2text(result, as_text=False):
+     """Convert OCR result to text"""
+
+     full_doc = []
+     for i, page in enumerate(result.pages, start=1):
+         text = ""
+         for block in page.blocks:
+             text += "\n\t"
+             for line in block.lines:
+                 for word in line.words:
+                     # print(dir(word))
+                     text += word.value + " "
+         full_doc.append(text)
+
+     return "\n".join(full_doc) if as_text else full_doc
+
+
+ def convert_PDF_to_Text(
+     PDF_file,
+     ocr_model=None,
+     max_pages: int = 20,
+ ):
+
+     st = time.perf_counter()
+     PDF_file = Path(PDF_file)
+     ocr_model = ocr_predictor(pretrained=True) if ocr_model is None else ocr_model
+     logging.info(f"starting OCR on {PDF_file.name}")
+     doc = DocumentFile.from_pdf(PDF_file)
+     truncated = False
+     if len(doc) > max_pages:
+         logging.warning(
+             f"PDF has {len(doc)} pages, which is more than {max_pages}.. truncating"
+         )
+         doc = doc[:max_pages]
+         truncated = True
+
+     # Analyze
+     logging.info(f"running OCR on {len(doc)} pages")
+     result = ocr_model(doc)
+     raw_text = result2text(result)
+     proc_text = [format_ocr_out(r) for r in raw_text]
+     fin_text = [postprocess(t) for t in proc_text]
+
+     ocr_results = "\n\n".join(fin_text)
+
+     fn_rt = time.perf_counter() - st
+
+     logging.info("OCR complete")
+
+     results_dict = {
+         "num_pages": len(doc),
+         "runtime": round(fn_rt, 2),
+         "date": str(date.today()),
+         "converted_text": ocr_results,
+         "truncated": truncated,
+         "length": len(ocr_results),
+     }
+
+     return results_dict
+
+
+ # @title translation functions
+
+ lt = LibreTranslateAPI("https://translate.astian.org/")
+
+
+ def translate_text(text, source_l, target_l="en"):
+
+     return str(lt.translate(text, source_l, target_l))
+
+
+ def translate_doc(filepath, lang_start, lang_end="en", verbose=False):
+     """translate a document from lang_start to lang_end
+     {'code': 'en', 'name': 'English'},
+     {'code': 'fr', 'name': 'French'},
+     {'code': 'de', 'name': 'German'},
+     {'code': 'it', 'name': 'Italian'},"""
+
+     src_folder = dirname(filepath)
+     src_folder = Path(src_folder)
+     trgt_folder = src_folder / f"translated_{lang_end}"
+     trgt_folder.mkdir(exist_ok=True)
+     with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
+         foreign_t = f.readlines()
+     in_name = basename(filepath)
+     translated_doc = []
+     for line in tqdm(
+         foreign_t, total=len(foreign_t), desc="translating {}...".format(in_name[:10])
+     ):
+         translated_line = translate_text(line, lang_start, lang_end)
+         translated_doc.append(translated_line)
+     t_out_name = "[To {}]".format(lang_end) + simple_rename(in_name) + ".txt"
+     out_path = join(trgt_folder, t_out_name)
+     with open(out_path, "w", encoding="utf-8", errors="ignore") as f_o:
+         f_o.writelines(translated_doc)
+     if verbose:
+         print("finished translating the document! - ", datetime.now())
+     return out_path
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio==3.0.11
+ tiktoken
+ chromadb
+ langchain
+ langchain-community
+ unstructured
+ unstructured[local-inference]
+ transformers
+ torch
+ faiss-cpu
+ sentence-transformers
+ bitsandbytes
+ accelerate
+ python-doctr
+ # additional packages imported by read_photodocument.py
+ clean-text
+ pyspellchecker
+ libretranslatepy
+ natsort
+ nltk
+ tqdm