Spaces:
Running
Running
Update bin_public/app/llama_func.py
Browse files- bin_public/app/llama_func.py +16 -12
bin_public/app/llama_func.py
CHANGED
@@ -15,34 +15,38 @@ from bin_public.utils.utils import *
|
|
15 |
|
16 |
def get_documents(file_src):
|
17 |
documents = []
|
18 |
-
index_name = ""
|
19 |
logging.debug("Loading documents...")
|
20 |
logging.debug(f"file_src: {file_src}")
|
21 |
for file in file_src:
|
22 |
-
logging.
|
23 |
-
index_name += file.name
|
24 |
if os.path.splitext(file.name)[1] == ".pdf":
|
25 |
logging.debug("Loading PDF...")
|
26 |
-
|
27 |
-
|
28 |
-
|
|
|
|
|
|
|
29 |
elif os.path.splitext(file.name)[1] == ".docx":
|
30 |
logging.debug("Loading DOCX...")
|
31 |
DocxReader = download_loader("DocxReader")
|
32 |
loader = DocxReader()
|
33 |
-
|
34 |
elif os.path.splitext(file.name)[1] == ".epub":
|
35 |
logging.debug("Loading EPUB...")
|
36 |
EpubReader = download_loader("EpubReader")
|
37 |
loader = EpubReader()
|
38 |
-
|
39 |
else:
|
40 |
logging.debug("Loading text file...")
|
41 |
with open(file.name, "r", encoding="utf-8") as f:
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
def construct_index(
|
|
|
15 |
|
16 |
def get_documents(file_src):
    """Read every file in *file_src* and return one ``Document`` per file.

    The loader is chosen from the file extension (matched
    case-insensitively):

    * ``.pdf``  -- text of every page is extracted with ``PyPDF2``.
    * ``.docx`` -- loaded through the ``DocxReader`` loader.
    * ``.epub`` -- loaded through the ``EpubReader`` loader.
    * anything else -- read as UTF-8 text.

    The raw text of each file is passed through ``add_space`` before it is
    wrapped in a ``Document``.

    Args:
        file_src: Iterable of objects exposing a ``name`` attribute that is
            a path openable with ``open()`` (presumably uploaded temp
            files -- confirm against the caller).

    Returns:
        A list with one ``Document`` per entry of *file_src*, in input order.

    Raises:
        OSError: If a file cannot be opened.
        UnicodeDecodeError: If a file handled by the plain-text branch is
            not valid UTF-8.
    """
    documents = []
    logging.debug("Loading documents...")
    logging.debug(f"file_src: {file_src}")
    for file in file_src:
        logging.info(f"loading file: {file.name}")
        # Compute the extension once and normalise its case so that e.g.
        # "REPORT.PDF" is not mistaken for a UTF-8 text file.
        extension = os.path.splitext(file.name)[1].lower()
        if extension == ".pdf":
            logging.debug("Loading PDF...")
            with open(file.name, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # Join once instead of growing a string with += per page;
                # a falsy extraction result is treated as empty text.
                text_raw = "".join(
                    page.extract_text() or ""
                    for page in tqdm(pdf_reader.pages)
                )
        elif extension == ".docx":
            logging.debug("Loading DOCX...")
            DocxReader = download_loader("DocxReader")
            loader = DocxReader()
            # Only the first document returned by the loader is used.
            text_raw = loader.load_data(file=file.name)[0].text
        elif extension == ".epub":
            logging.debug("Loading EPUB...")
            EpubReader = download_loader("EpubReader")
            loader = EpubReader()
            # Only the first document returned by the loader is used.
            text_raw = loader.load_data(file=file.name)[0].text
        else:
            logging.debug("Loading text file...")
            with open(file.name, "r", encoding="utf-8") as f:
                text_raw = f.read()
        text = add_space(text_raw)
        # NOTE: splitting the text with block_split is currently disabled;
        # each file becomes exactly one Document.
        documents.append(Document(text))
    logging.debug("Documents loaded.")
    return documents
|
50 |
|
51 |
|
52 |
def construct_index(
|