dh-mc committed
Commit e8c6d72
Parent: 7806e7d

fixed bug in metadata URL handling

Files changed (2)
  1. app_modules/qa_chain.py +4 -2
  2. ingest.py +41 -5
app_modules/qa_chain.py CHANGED
@@ -140,8 +140,10 @@ class QAChain:
 
         if self.llm is None:
             if self.llm_model_type == "openai":
+                MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
+                print(f" using model: {MODEL_NAME}")
                 self.llm = ChatOpenAI(
-                    model_name="gpt-4",
+                    model_name=MODEL_NAME,
                     streaming=True,
                     callbacks=callbacks,
                     verbose=True,
@@ -536,7 +538,7 @@ class QAChain:
         result["answer"] = remove_extra_spaces(result["answer"])
 
         base_url = os.environ.get("PDF_FILE_BASE_URL")
-        if base_url is not None:
+        if base_url is not None and len(base_url) > 0:
             documents = result["source_documents"]
             for doc in documents:
                 source = doc.metadata["source"]
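The net effect of the first hunk is that the OpenAI model is no longer hard-coded to gpt-4 but is read from the OPENAI_MODEL_NAME environment variable, falling back to gpt-4 when it is unset. A minimal standalone sketch of the same pattern (the import path is an assumption based on the LangChain version this repo appears to use; qa_chain.py's own imports are not shown in this diff):

import os

# Assumed import; matches the langchain 0.0.x style used elsewhere in this repo.
from langchain.chat_models import ChatOpenAI

# Same fallback logic as the commit: use OPENAI_MODEL_NAME if set, else "gpt-4".
MODEL_NAME = os.environ.get("OPENAI_MODEL_NAME") or "gpt-4"
llm = ChatOpenAI(model_name=MODEL_NAME, streaming=True, verbose=True)

The second hunk tightens the guard on PDF_FILE_BASE_URL: the source-document URLs are now rewritten only when the variable is both set and non-empty, not merely set.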
ingest.py CHANGED
@@ -13,9 +13,17 @@ from langchain.vectorstores.faiss import FAISS
 from app_modules.utils import *
 
 
-def load_documents(source_pdfs_path) -> List:
+def load_documents(source_pdfs_path, urls) -> List:
     loader = PyPDFDirectoryLoader(source_pdfs_path, silent_errors=True)
     documents = loader.load()
+    if urls is not None and len(urls) > 0:
+        for doc in documents:
+            source = doc.metadata["source"]
+            filename = source.split("/")[-1]
+            for url in urls:
+                if url.endswith(filename):
+                    doc.metadata["url"] = url
+                    break
     return documents
 
 
@@ -55,6 +63,7 @@ hf_embeddings_model_name = (
 index_path = os.environ.get("FAISS_INDEX_PATH") or os.environ.get("CHROMADB_INDEX_PATH")
 using_faiss = os.environ.get("FAISS_INDEX_PATH") is not None
 source_pdfs_path = os.environ.get("SOURCE_PDFS_PATH")
+source_urls = os.environ.get("SOURCE_URLS")
 chunk_size = os.environ.get("CHUNCK_SIZE")
 chunk_overlap = os.environ.get("CHUNK_OVERLAP")
 
@@ -69,11 +78,29 @@ print(f"Completed in {end - start:.3f}s")
 start = timer()
 
 if not os.path.isdir(index_path):
-    print("The index persist directory is not present. Creating a new one.")
+    print(
+        f"The index persist directory {index_path} is not present. Creating a new one."
+    )
     os.mkdir(index_path)
 
-    print(f"Loading PDF files from {source_pdfs_path}")
-    sources = load_documents(source_pdfs_path)
+    if source_urls is not None:
+        # Open the file for reading
+        file = open(source_urls, "r")
+
+        # Read the contents of the file into a list of strings
+        lines = file.readlines()
+
+        # Close the file
+        file.close()
+
+        # Remove the newline characters from each string
+        source_urls = [line.strip() for line in lines]
+
+    print(
+        f"Loading {'' if source_urls is None else str(len(source_urls)) + ' '}PDF files from {source_pdfs_path}"
+    )
+    sources = load_documents(source_pdfs_path, source_urls)
+
     print(f"Splitting {len(sources)} PDF pages in to chunks ...")
 
     chunks = split_chunks(
@@ -83,12 +110,21 @@ if not os.path.isdir(index_path):
 
     index = generate_index(chunks, embeddings)
 else:
-    print("The index persist directory is present. Loading index ...")
+    print(f"The index persist directory {index_path} is present. Loading index ...")
     index = (
         FAISS.load_local(index_path, embeddings)
         if using_faiss
         else Chroma(embedding_function=embeddings, persist_directory=index_path)
     )
+    query = "hi"
+    print(f"Load relevant documents for standalone question: {query}")
+
+    start2 = timer()
+    docs = index.as_retriever().get_relevant_documents(query)
+    end = timer()
+
+    print(f"Completed in {end - start2:.3f}s")
+    print(docs)
 
 end = timer()
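Taken together, the ingest.py changes let each loaded PDF carry the URL it came from: SOURCE_URLS points to a plain-text file with one URL per line, and load_documents attaches a url metadata field to a document when one of those URLs ends with the PDF's basename. A self-contained sketch of that matching rule, with hypothetical paths and URLs (none of these names come from the repo):

# Hypothetical URL list, as it would look after reading the SOURCE_URLS file.
urls = [
    "https://example.com/pdfs/report-a.pdf",
    "https://example.com/pdfs/report-b.pdf",
]

# Hypothetical value of doc.metadata["source"] for one loaded PDF page.
source = "data/pdfs/report-b.pdf"
filename = source.split("/")[-1]  # "report-b.pdf"

# Same rule as load_documents: first URL whose tail matches the basename wins.
matched = next((url for url in urls if url.endswith(filename)), None)
print(matched)  # https://example.com/pdfs/report-b.pdf

Because the match uses endswith and breaks on the first hit, two URLs ending in the same filename would resolve to whichever appears first in the file.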