Sean-Case committed
Commit d4b0a2c
1 Parent(s): c2ff47a

Added csv/Excel file support

Files changed (3)
  1. app.py +25 -15
  2. chatfuncs/chatfuncs.py +36 -17
  3. chatfuncs/ingest.py +114 -14
app.py CHANGED
@@ -5,7 +5,7 @@ import os
 
 # Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
 #os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.42.0")
+#os.system("pip install gradio==3.42.0")
 
 from typing import TypeVar
 from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
@@ -25,7 +25,6 @@ PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 #from chatfuncs.chatfuncs import *
 import chatfuncs.ingest as ing
 
-
 ## Load preset embeddings, vectorstore, and model
 
 embeddings_name = "BAAI/bge-base-en-v1.5"
@@ -107,7 +106,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
     if model_type == "Flan Alpaca (small, fast)":
         # Huggingface chat model
-        hf_checkpoint = 'declare-lab/flan-alpaca-large'
+        hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
 
         def create_hf_model(model_name):
 
@@ -140,9 +139,8 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
     return model_type, load_confirmation, model_type
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
-model_type = "Mistral Open Orca (larger, slow)"
-
-load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
+#model_type = "Mistral Open Orca (larger, slow)"
+#load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
 model_type = "Flan Alpaca (small, fast)"
 load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
@@ -183,7 +181,7 @@ with block:
 
     gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-    gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Mistral Open Orca (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
+    gr.Markdown("Chat with PDF, web page or (new) csv/Excel documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Mistral Open Orca (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
     with gr.Row():
         current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
@@ -192,10 +190,10 @@ with block:
     with gr.Tab("Chatbot"):
 
         with gr.Row():
-            chat_height = 500
-            chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
+            #chat_height = 500
+            chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
             with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
-                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", height=chat_height, scale = 2)
+                sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", scale = 1) # , height=chat_height
 
         with gr.Row():
             message = gr.Textbox(
@@ -219,18 +217,23 @@ with block:
 
 
 
-    with gr.Tab("Load in a different PDF file or web page to chat"):
+    with gr.Tab("Load in a different file to chat with"):
         with gr.Accordion("PDF file", open = False):
             in_pdf = gr.File(label="Upload pdf", file_count="multiple", file_types=['.pdf'])
             load_pdf = gr.Button(value="Load in file", variant="secondary", scale=0)
 
         with gr.Accordion("Web page", open = False):
             with gr.Row():
-                in_web = gr.Textbox(label="Enter webpage url")
-                in_div = gr.Textbox(label="(Advanced) Webpage div for text extraction", value="p", placeholder="p")
-                load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+                in_web = gr.Textbox(label="Enter web page url")
+                in_div = gr.Textbox(label="(Advanced) Web page div for text extraction", value="p", placeholder="p")
+                load_web = gr.Button(value="Load in webpage", variant="secondary", scale=0)
+
+        with gr.Accordion("CSV/Excel file", open = False):
+            in_csv = gr.File(label="Upload CSV/Excel file", file_count="multiple", file_types=['.csv', '.xlsx'])
+            in_text_column = gr.Textbox(label="Enter column name where text is stored")
+            load_csv = gr.Button(value="Load in CSV/Excel file", variant="secondary", scale=0)
 
-        ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
+        ingest_embed_out = gr.Textbox(label="File/web page preparation progress")
 
     with gr.Tab("Advanced features"):
         with gr.Row():
@@ -264,6 +267,12 @@ with block:
                      then(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
                      then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
                      then(chatf.hide_block, outputs = [examples_set])
+
+    # Load in a csv/excel file
+    load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
+                     then(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
+                     then(docs_to_faiss_save, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state]).\
+                     then(chatf.hide_block, outputs = [examples_set])
 
     # Load in a webpage
 
@@ -289,6 +298,7 @@ with block:
     clear.click(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic])
    clear.click(lambda: None, None, chatbot, queue=False)
 
+    # Thumbs up or thumbs down voting function
    chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)
 
 block.queue(concurrency_count=1).launch(debug=True)
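
For reference, the chain wired to `load_csv` above can be exercised outside Gradio. A minimal sketch, assuming the repo's `chatfuncs.ingest` module is importable; `FakeUpload` and `reviews.csv` are hypothetical stand-ins for Gradio's uploaded-file object, which exposes the temp path via `.name`:

```python
# Headless sketch of the new CSV ingestion chain wired to load_csv above.
import chatfuncs.ingest as ing

class FakeUpload:
    """Mimics Gradio's file object: parse_csv_or_excel reads file_path.name."""
    def __init__(self, path):
        self.name = path

# Step 1: read the file(s) into one DataFrame, tagging 'source' and 'page_section'
df, file_names = ing.parse_csv_or_excel([FakeUpload("reviews.csv")], text_column="text")

# Step 2: turn each row into a Document whose metadata holds the non-text columns
docs = ing.csv_excel_text_to_docs(df, text_column="text")
print(file_names, len(docs))
```

In the app itself, `docs_to_faiss_save` then embeds these documents into the FAISS vectorstore, mirroring the existing PDF and web-page chains.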
chatfuncs/chatfuncs.py CHANGED
@@ -1,4 +1,5 @@
 import re
+import os
 import datetime
 from typing import TypeVar, Dict, List, Tuple
 import time
@@ -66,7 +67,7 @@ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
-
+# Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
     torch_device = "cuda"
     gpu_layers = 0
@@ -136,18 +137,6 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-#@dataclass
-#class CtransGenGenerationConfig:
-#    top_k: int = top_k
-#    top_p: float = top_p
-#    temperature: float = temperature
-#    repetition_penalty: float = tinyllama_repetition_penalty
-#    last_n_tokens: int = last_n_tokens
-#    seed: int = seed
-#    batch_size:int = batch_size
-#    threads: int = threads
-#    reset: bool = True
-
 class CtransGenGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
@@ -333,7 +322,11 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
 
     # Expand the found passages to the neighbouring context
-    docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
+    file_type = determine_file_type(doc_df['meta_url'][0])
+
+    # Only expand passages if not tabular data
+    if (file_type != ".csv") & (file_type != ".xlsx"):
+        docs_keep_as_doc, doc_df = get_expanded_passages(vectorstore, docs_keep_out, width=3)
 
     if docs_keep_as_doc == []:
         {"answer": "I'm sorry, I couldn't find a relevant answer to this question.", "sources":"I'm sorry, I couldn't find a relevant source for this question."}
@@ -344,8 +337,9 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
     doc_df['meta_clean'] = [f"<b>{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}</b>" for d in doc_df['metadata']]
     doc_df['content_meta'] = doc_df['meta_clean'].astype(str) + ".<br><br>" + doc_df['page_content'].astype(str)
 
-    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
-    docs_content_string = ''.join(modified_page_content)
+    #modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['page_content'])]
+    modified_page_content = [f" SOURCE {i+1} - {word}" for i, word in enumerate(doc_df['content_meta'])]
+    docs_content_string = '<br><br>'.join(modified_page_content)
 
     sources_docs_content_string = '<br><br>'.join(doc_df['content_meta'])#.replace(" "," ")#.strip()
 
@@ -481,6 +475,19 @@ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_
 
     return new_question_kworded
 
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
+
+
 def create_doc_df(docs_keep_out):
     # Extract content and metadata from 'winning' passages.
     content=[]
@@ -489,11 +496,17 @@ def create_doc_df(docs_keep_out):
     page_section=[]
     score=[]
 
+
+
     for item in docs_keep_out:
         content.append(item[0].page_content)
         meta.append(item[0].metadata)
         meta_url.append(item[0].metadata['source'])
-        page_section.append(item[0].metadata['page_section'])
+
+        file_extension = determine_file_type(item[0].metadata['source'])
+        if (file_extension != ".csv") & (file_extension != ".xlsx"):
+            page_section.append(item[0].metadata['page_section'])
+        else: page_section.append("")
         score.append(item[1])
 
     # Create df from 'winning' passages
@@ -728,6 +741,12 @@ def get_expanded_passages(vectorstore, docs, width):
     expanded_docs = []
     for doc, score in docs:
         search_source = doc.metadata['source']
+
+
+        #if file_type == ".csv" | file_type == ".xlsx":
+        #    content_str, meta_first, meta_last = get_parent_content_and_meta(vstore_by_source[search_source], 0, search_index)
+
+        #else:
         search_section = doc.metadata['page_section']
         parent_vstore_meta_section = [doc.metadata['page_section'] for _, doc in vstore_by_source[search_source]]
         search_index = parent_vstore_meta_section.index(search_section) if search_section in parent_vstore_meta_section else -1
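
The guard added above routes tabular sources around `get_expanded_passages`: neighbouring rows of a csv/Excel file are unrelated records, so expanding a hit to its neighbours would add noise rather than context. A minimal sketch of the routing test (the file names are hypothetical; `determine_file_type` is copied from the diff above):

```python
import os

def determine_file_type(file_path):
    # As added in this commit: lower-cased extension, e.g. '.csv'
    return os.path.splitext(file_path)[1].lower()

for source in ["plan.pdf", "comments.csv", "Survey.XLSX"]:
    file_type = determine_file_type(source)
    # Same test as the diff; bitwise & works on Python bools, though `and` is more idiomatic
    expand = (file_type != ".csv") & (file_type != ".xlsx")
    print(source, "-> expand passages:", expand)

# plan.pdf -> expand passages: True
# comments.csv -> expand passages: False
# Survey.XLSX -> expand passages: False
```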
chatfuncs/ingest.py CHANGED
@@ -44,31 +44,32 @@ chunk_overlap = 0
 start_index = True
 
 ## Parse files
+def determine_file_type(file_path):
+    """
+    Determine the file type based on its extension.
+
+    Parameters:
+        file_path (str): Path to the file.
+
+    Returns:
+        str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
+    """
+    return os.path.splitext(file_path)[1].lower()
 
-def parse_file(file_paths):
+def parse_file(file_paths, text_column='text'):
     """
     Accepts a list of file paths, determines each file's type based on its extension,
     and passes it to the relevant parsing function.
 
     Parameters:
         file_paths (list): List of file paths.
-        div (str): (optional) Div to pull out of html file/url with BeautifulSoup
+        text_column (str): Name of the column in CSV/Excel files that contains the text content.
 
     Returns:
         dict: A dictionary with file paths as keys and their parsed content (or error message) as values.
     """
 
-    def determine_file_type(file_path):
-        """
-        Determine the file type based on its extension.
-
-        Parameters:
-            file_path (str): Path to the file.
 
-        Returns:
-            str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
-        """
-        return os.path.splitext(file_path)[1].lower()
 
     if not isinstance(file_paths, list):
         raise ValueError("Expected a list of file paths.")
@@ -78,7 +79,9 @@ def parse_file(file_paths):
         '.docx': parse_docx,
         '.txt': parse_txt,
         '.html': parse_html,
-        '.htm': parse_html # Considering both .html and .htm for HTML files
+        '.htm': parse_html, # Considering both .html and .htm for HTML files
+        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
+        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column)
     }
 
     parsed_contents = {}
@@ -115,6 +118,64 @@ def text_regex_clean(text):
 
     return text
 
+def parse_csv_or_excel(file_paths, text_column = "text"):
+    """
+    Read in a CSV or Excel file.
+
+    Parameters:
+        file_path (str): Path to the CSV file.
+        text_column (str): Name of the column in the CSV file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+
+    file_names = []
+    out_df = pd.DataFrame()
+
+    for file_path in file_paths:
+        file_extension = determine_file_type(file_path.name)
+        file_name = get_file_path_end(file_path.name)
+
+        if file_extension == ".csv":
+            df = pd.read_csv(file_path.name)
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        elif file_extension == ".xlsx":
+            df = pd.read_excel(file_path.name, engine='openpyxl')
+            if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
+            df['source'] = file_name
+            df['page_section'] = ""
+        else:
+            print(f"Unsupported file type: {file_extension}")
+            return pd.DataFrame(), ['Please choose a valid file type']
+
+        file_names.append(file_name)
+        out_df = pd.concat([out_df, df])
+
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return out_df, file_names
+
+def parse_excel(file_path, text_column):
+    """
+    Read text from an Excel file.
+
+    Parameters:
+        file_path (str): Path to the Excel file.
+        text_column (str): Name of the column in the Excel file that contains the text content.
+
+    Returns:
+        Pandas DataFrame: Dataframe output from file read
+    """
+    df = pd.read_excel(file_path, engine='openpyxl')
+    #if text_column not in df.columns:
+    #    return f"Column '{text_column}' not found in {file_path}"
+    #text_out = " ".join(df[text_column].dropna().astype(str))
+    return df
+
 def parse_pdf(file) -> List[str]:
 
     """
@@ -308,8 +369,9 @@ def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document
         if ext == '.pdf':
             docs, page_docs = pdf_text_to_docs(content, chunk_size)
         elif ext in ['.html', '.htm', '.txt', '.docx']:
-            # Assuming you want to process HTML similarly to PDF in this context
             docs = html_text_to_docs(content, chunk_size)
+        elif ext in ['.csv', '.xlsx']:
+            docs, page_docs = csv_excel_text_to_docs(content, chunk_size)
         else:
             print(f"Unsupported file type {ext} for {file_path}. Skipping.")
             continue
@@ -400,6 +462,44 @@ def html_text_to_docs(texts, metadatas, chunk_size:int = chunk_size):
 
     return documents
 
+def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
+    """Converts a DataFrame's content to a list of Documents with metadata."""
+
+    doc_sections = []
+    df[text_column] = df[text_column].astype(str) # Ensure column is a string column
+
+    # For each row in the dataframe
+    for idx, row in df.iterrows():
+        # Extract the text content for the document
+        doc_content = row[text_column]
+
+        # Generate metadata containing other columns' data
+        metadata = {"row": idx + 1}
+        for col, value in row.items():
+            if col != text_column:
+                metadata[col] = value
+
+        # If chunk_size is provided, split the text into chunks
+        if chunk_size:
+            # Assuming you have a text splitter function similar to the PDF handling
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=chunk_size,
+                # Other arguments as required by the splitter
+            )
+            sections = text_splitter.split_text(doc_content)
+
+            # For each section, create a Document object
+            for i, section in enumerate(sections):
+                doc = Document(page_content=section,
+                               metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
+                doc_sections.append(doc)
+        else:
+            # If no chunk_size is provided, create a single Document object for the row
+            doc = Document(page_content=doc_content, metadata=metadata)
+            doc_sections.append(doc)
+
+    return doc_sections
+
 # # Functions for working with documents after loading them back in
 
 def pull_out_data(series):
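
To see what `csv_excel_text_to_docs` produces, a toy example with hypothetical data (the column names and values are made up; the function itself is the one added above):

```python
import pandas as pd
import chatfuncs.ingest as ing

df = pd.DataFrame({
    "text": ["First comment about the plan.", "Second comment."],
    "author": ["A", "B"],  # hypothetical extra column; becomes Document metadata
})

# With no chunk_size, each row becomes one Document; metadata carries the
# 1-based row number plus every non-text column.
docs = ing.csv_excel_text_to_docs(df, text_column="text")
print(docs[0].page_content)  # First comment about the plan.
print(docs[0].metadata)      # {'row': 1, 'author': 'A'}
```

One caution: the function returns a single list, so the two-value unpacking in `text_to_docs` above (`docs, page_docs = csv_excel_text_to_docs(content, chunk_size)`) will raise a ValueError if reached; the Gradio chain in app.py avoids this path by calling the function directly with `in_text_column`.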