fracapuano committed on
Commit
a96d162
1 Parent(s): 11ce856

fix: now also storing the name of the file in each document's metadata

Browse files
Files changed (1) hide show
  1. qa/utils.py +10 -6
qa/utils.py CHANGED
@@ -95,7 +95,7 @@ def get_text_splitter(
95
  return text_splitter
96
 
97
  @st.cache_data
98
- def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
99
  """
100
  Converts a string or frozenset of strings to a list of Documents
101
  with metadata.
@@ -112,6 +112,8 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
112
  # Add page numbers as metadata
113
  for i, doc in enumerate(page_docs):
114
  doc.metadata["page"] = i + 1
 
 
115
  # Split pages into chunks
116
  doc_chunks = []
117
  # Get the text splitter
@@ -122,15 +124,17 @@ def text_to_docs(text: Union[Text, Tuple[Text]]) -> List[Document]:
122
  chunks = text_splitter.split_text(doc.page_content)
123
  for i, chunk in enumerate(chunks):
124
  # Create a new document for each individual chunk
125
- doc = HashDocument(
126
- page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
 
127
  )
128
  # Add sources to metadata for retrieval later on
129
- doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
130
- doc_chunks.append(doc)
 
131
 
132
  return doc_chunks
133
-
134
 
135
  @st.cache_data
136
  def embed_docs(_docs: Tuple[Document]) -> VectorStore:
 
95
  return text_splitter
96
 
97
  @st.cache_data
98
+ def text_to_docs(text: Union[Text, Tuple[Text]], **kwargs) -> List[Document]:
99
  """
100
  Converts a string or frozenset of strings to a list of Documents
101
  with metadata.
 
112
  # Add page numbers as metadata
113
  for i, doc in enumerate(page_docs):
114
  doc.metadata["page"] = i + 1
115
+ doc.metadata["file_name"] = kwargs.get("file_name", "")
116
+
117
  # Split pages into chunks
118
  doc_chunks = []
119
  # Get the text splitter
 
124
  chunks = text_splitter.split_text(doc.page_content)
125
  for i, chunk in enumerate(chunks):
126
  # Create a new document for each individual chunk
127
+ new_doc = HashDocument(
128
+ page_content=chunk,
129
+ metadata={"file_name": doc.metadata["file_name"], "page": doc.metadata["page"], "chunk": i}
130
  )
131
  # Add sources to metadata for retrieval later on
132
+ new_doc.metadata["source"] = \
133
+ f"{new_doc.metadata['file_name']}/Page-{new_doc.metadata['page']}/Chunk-{new_doc.metadata['chunk']}"
134
+ doc_chunks.append(new_doc)
135
 
136
  return doc_chunks
137
+
138
 
139
  @st.cache_data
140
  def embed_docs(_docs: Tuple[Document]) -> VectorStore: