Spaces:
Runtime error
Runtime error
ffreemt
committed on
Commit
·
f1dfff2
1
Parent(s):
4f331cc
Update epub files
Browse files
app.py
CHANGED
@@ -4,7 +4,7 @@ and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py
|
|
4 |
|
5 |
https://python.langchain.com/en/latest/getting_started/tutorials.html
|
6 |
"""
|
7 |
-
# pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long
|
8 |
import os
|
9 |
import time
|
10 |
from pathlib import Path
|
@@ -13,11 +13,15 @@ from types import SimpleNamespace
|
|
13 |
import gradio as gr
|
14 |
from charset_normalizer import detect
|
15 |
from chromadb.config import Settings
|
|
|
16 |
from langchain.chains import RetrievalQA
|
17 |
from langchain.docstore.document import Document
|
18 |
-
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
21 |
|
22 |
# from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
|
23 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
@@ -36,7 +40,6 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
|
|
36 |
# import click
|
37 |
# from typing import List
|
38 |
|
39 |
-
|
40 |
# from utils import xlxs_to_csv
|
41 |
|
42 |
# load possible env such as OPENAI_API_KEY
|
@@ -87,15 +90,26 @@ def load_single_document(file_path: str | Path) -> Document:
|
|
87 |
loader = PDFMinerLoader(file_path)
|
88 |
elif file_path.endswith(".csv"):
|
89 |
loader = CSVLoader(file_path)
|
90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
91 |
else:
|
92 |
if encoding is None:
|
93 |
logger.warning(
|
94 |
f" {file_path}'s encoding is None "
|
95 |
"Likely binary files, return empty str "
|
96 |
)
|
97 |
-
return ""
|
98 |
-
|
99 |
try:
|
100 |
loader = TextLoader(file_path)
|
101 |
except Exception as exc:
|
@@ -319,6 +333,9 @@ def main():
|
|
319 |
|
320 |
def respond(message, chat_history):
|
321 |
# bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
|
|
|
|
|
|
|
322 |
res = ns.qa(message)
|
323 |
answer, docs = res["result"], res["source_documents"]
|
324 |
bot_message = f"{answer} ({docs})"
|
|
|
4 |
|
5 |
https://python.langchain.com/en/latest/getting_started/tutorials.html
|
6 |
"""
|
7 |
+
# pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements
|
8 |
import os
|
9 |
import time
|
10 |
from pathlib import Path
|
|
|
13 |
import gradio as gr
|
14 |
from charset_normalizer import detect
|
15 |
from chromadb.config import Settings
|
16 |
+
from epub2txt import epub2txt
|
17 |
from langchain.chains import RetrievalQA
|
18 |
from langchain.docstore.document import Document
|
19 |
+
from langchain.document_loaders import (
|
20 |
+
CSVLoader,
|
21 |
+
Docx2txtLoader,
|
22 |
+
PDFMinerLoader,
|
23 |
+
TextLoader,
|
24 |
+
)
|
25 |
|
26 |
# from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
|
27 |
from langchain.embeddings import HuggingFaceInstructEmbeddings
|
|
|
40 |
# import click
|
41 |
# from typing import List
|
42 |
|
|
|
43 |
# from utils import xlxs_to_csv
|
44 |
|
45 |
# load possible env such as OPENAI_API_KEY
|
|
|
90 |
loader = PDFMinerLoader(file_path)
|
91 |
elif file_path.endswith(".csv"):
|
92 |
loader = CSVLoader(file_path)
|
93 |
+
elif Path(file_path).suffix in [".docx"]:
|
94 |
+
try:
|
95 |
+
loader = Docx2txtLoader(file_path)
|
96 |
+
except Exception as exc:
|
97 |
+
logger.error(f" {file_path} errors: {exc}")
|
98 |
+
return Document(page_content="", metadata={"source": file_path})
|
99 |
+
elif Path(file_path).suffix in [".epub"]: # for epub? epub2txt unstructured
|
100 |
+
try:
|
101 |
+
_ = epub2txt(file_path)
|
102 |
+
except Exception as exc:
|
103 |
+
logger.error(f" {file_path} errors: {exc}")
|
104 |
+
return Document(page_content="", metadata={"source": file_path})
|
105 |
+
return Document(page_content=_, metadata={"source": file_path})
|
106 |
else:
|
107 |
if encoding is None:
|
108 |
logger.warning(
|
109 |
f" {file_path}'s encoding is None "
|
110 |
"Likely binary files, return empty str "
|
111 |
)
|
112 |
+
return Document(page_content="", metadata={"source": file_path})
|
|
|
113 |
try:
|
114 |
loader = TextLoader(file_path)
|
115 |
except Exception as exc:
|
|
|
333 |
|
334 |
def respond(message, chat_history):
|
335 |
# bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
|
336 |
+
if ns.qa is None: # no files processed yet
|
337 |
+
return "Provide some file(s) for processsing first.", chat_history
|
338 |
+
|
339 |
res = ns.qa(message)
|
340 |
answer, docs = res["result"], res["source_documents"]
|
341 |
bot_message = f"{answer} ({docs})"
|