ffreemt committed on
Commit
f1dfff2
·
1 Parent(s): 4f331cc

Update epub files

Browse files
Files changed (1) hide show
  1. app.py +25 -8
app.py CHANGED
@@ -4,7 +4,7 @@ and https://github.com/PromtEngineer/localGPT/blob/main/ingest.py
4
 
5
  https://python.langchain.com/en/latest/getting_started/tutorials.html
6
  """
7
- # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long
8
  import os
9
  import time
10
  from pathlib import Path
@@ -13,11 +13,15 @@ from types import SimpleNamespace
13
  import gradio as gr
14
  from charset_normalizer import detect
15
  from chromadb.config import Settings
 
16
  from langchain.chains import RetrievalQA
17
  from langchain.docstore.document import Document
18
-
19
- # Docx2txtLoader
20
- from langchain.document_loaders import CSVLoader, PDFMinerLoader, TextLoader
 
 
 
21
 
22
  # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
23
  from langchain.embeddings import HuggingFaceInstructEmbeddings
@@ -36,7 +40,6 @@ from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline
36
  # import click
37
  # from typing import List
38
 
39
-
40
  # from utils import xlxs_to_csv
41
 
42
  # load possible env such as OPENAI_API_KEY
@@ -87,15 +90,26 @@ def load_single_document(file_path: str | Path) -> Document:
87
  loader = PDFMinerLoader(file_path)
88
  elif file_path.endswith(".csv"):
89
  loader = CSVLoader(file_path)
90
- # elif file_path.endswith(".epub"): # for epub? epub2txt unstructured
 
 
 
 
 
 
 
 
 
 
 
 
91
  else:
92
  if encoding is None:
93
  logger.warning(
94
  f" {file_path}'s encoding is None "
95
  "Likely binary files, return empty str "
96
  )
97
- return ""
98
-
99
  try:
100
  loader = TextLoader(file_path)
101
  except Exception as exc:
@@ -319,6 +333,9 @@ def main():
319
 
320
  def respond(message, chat_history):
321
  # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
 
 
 
322
  res = ns.qa(message)
323
  answer, docs = res["result"], res["source_documents"]
324
  bot_message = f"{answer} ({docs})"
 
4
 
5
  https://python.langchain.com/en/latest/getting_started/tutorials.html
6
  """
7
+ # pylint: disable=broad-exception-caught, unused-import, invalid-name, line-too-long, too-many-return-statements
8
  import os
9
  import time
10
  from pathlib import Path
 
13
  import gradio as gr
14
  from charset_normalizer import detect
15
  from chromadb.config import Settings
16
+ from epub2txt import epub2txt
17
  from langchain.chains import RetrievalQA
18
  from langchain.docstore.document import Document
19
+ from langchain.document_loaders import (
20
+ CSVLoader,
21
+ Docx2txtLoader,
22
+ PDFMinerLoader,
23
+ TextLoader,
24
+ )
25
 
26
  # from constants import CHROMA_SETTINGS, SOURCE_DIRECTORY, PERSIST_DIRECTORY
27
  from langchain.embeddings import HuggingFaceInstructEmbeddings
 
40
  # import click
41
  # from typing import List
42
 
 
43
  # from utils import xlxs_to_csv
44
 
45
  # load possible env such as OPENAI_API_KEY
 
90
  loader = PDFMinerLoader(file_path)
91
  elif file_path.endswith(".csv"):
92
  loader = CSVLoader(file_path)
93
+ elif Path(file_path).suffix in [".docx"]:
94
+ try:
95
+ loader = Docx2txtLoader(file_path)
96
+ except Exception as exc:
97
+ logger.error(f" {file_path} errors: {exc}")
98
+ return Document(page_content="", metadata={"source": file_path})
99
+ elif Path(file_path).suffix in [".epub"]: # for epub? epub2txt unstructured
100
+ try:
101
+ _ = epub2txt(file_path)
102
+ except Exception as exc:
103
+ logger.error(f" {file_path} errors: {exc}")
104
+ return Document(page_content="", metadata={"source": file_path})
105
+ return Document(page_content=_, metadata={"source": file_path})
106
  else:
107
  if encoding is None:
108
  logger.warning(
109
  f" {file_path}'s encoding is None "
110
  "Likely binary files, return empty str "
111
  )
112
+ return Document(page_content="", metadata={"source": file_path})
 
113
  try:
114
  loader = TextLoader(file_path)
115
  except Exception as exc:
 
333
 
334
  def respond(message, chat_history):
335
  # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
336
+ if ns.qa is None: # no files processed yet
337
+ return "Provide some file(s) for processsing first.", chat_history
338
+
339
  res = ns.qa(message)
340
  answer, docs = res["result"], res["source_documents"]
341
  bot_message = f"{answer} ({docs})"