YchKhan committed on
Commit
c9d9111
1 Parent(s): a747794

Update split_files_to_excel.py

Browse files
Files changed (1) hide show
  1. split_files_to_excel.py +26 -1
split_files_to_excel.py CHANGED
@@ -20,6 +20,9 @@ from unstructured.partition.auto import partition
20
 
21
  from transformers import AutoTokenizer
22
 
 
 
 
23
  MODEL = "thenlper/gte-base"
24
  CHUNK_SIZE = 1000
25
  CHUNK_OVERLAP = 200
@@ -471,4 +474,26 @@ def build_index(docs, index, output_folder):
471
  with tempfile.TemporaryDirectory() as temp_dir:
472
  index.save_local(temp_dir)
473
  for f in os.listdir(temp_dir):
474
- output_folder.upload_file(f, os.path.join(temp_dir, f))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  from transformers import AutoTokenizer
22
 
23
+ import pandas as pd
24
+
25
+
26
  MODEL = "thenlper/gte-base"
27
  CHUNK_SIZE = 1000
28
  CHUNK_OVERLAP = 200
 
474
  with tempfile.TemporaryDirectory() as temp_dir:
475
  index.save_local(temp_dir)
476
  for f in os.listdir(temp_dir):
477
+ output_folder.upload_file(f, os.path.join(temp_dir, f))
478
+
479
+
480
def split_in_df(files):
    """Split *files* into chunks and export them to an Excel workbook.

    The chunks come from ``split_doc_in_chunks(files)``. Each chunk becomes
    one row: its ``page_content`` goes in the ``Content`` column and every
    entry of its ``metadata`` mapping gets a column named after its key.
    Chunks that lack a given metadata key get an empty cell in that column.

    Args:
        files: Passed through unchanged to ``split_doc_in_chunks``.

    Returns:
        The relative path of the workbook written, ``"dataframe.xlsx"``.
    """
    output_path = "dataframe.xlsx"

    # Collect plain dicts and build the frame once. ``DataFrame.append`` was
    # removed in pandas 2.0, and calling it per row copied the whole frame on
    # every iteration.
    # Metadata is unpacked after 'Content', so a metadata key that is itself
    # named 'Content' overrides the chunk text, as the per-key assignment did.
    rows = [
        {'Content': document.page_content, **document.metadata}
        for document in split_doc_in_chunks(files)
    ]

    df = pd.DataFrame(rows)
    df.to_excel(output_path, index=False)

    return output_path