# Page metadata captured alongside this listing (not part of the program):
# Spaces: Sleeping | File size: 2,837 bytes | Revision: ee8fb16
import os
import json
from pathlib import Path
from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader
def _format_json_item(item):
    """Render one JSON value as text: ``key: value`` lines for a dict, ``str()`` otherwise."""
    if isinstance(item, dict):
        return "\n".join(f"{k}: {v}" for k, v in item.items())
    return str(item)


def _json_to_documents(json_data, source):
    """Convert parsed JSON into Documents whose metadata carries *source*.

    Args:
        json_data: The object returned by ``json.load``.
        source: String stored under the ``'source'`` metadata key.

    Returns:
        A list with one Document per element for a top-level list, a single
        Document for a top-level dict, or an empty list for any other
        top-level value.
    """
    if isinstance(json_data, list):
        # One Document per element, so no record is dropped and an empty
        # list yields an empty result rather than None.
        return [
            Document(page_content=_format_json_item(item), metadata={'source': source})
            for item in json_data
        ]

    if isinstance(json_data, dict):
        # Collect fragments and join once instead of repeated string +=.
        parts = []
        for key, value in json_data.items():
            parts.append(f"**{key}**\n\n")
            if isinstance(value, list):
                parts.extend(_format_json_item(item) + "\n\n" for item in value)
            else:
                parts.append(str(value) + "\n\n")
        return [Document(page_content="".join(parts), metadata={'source': source})]

    print(f"Unsupported JSON structure in {source}")
    return []


def load_file(filepath):
    """Load a single file into a list of Document objects, chosen by file suffix.

    ``.txt``/``.md``, ``.csv``, ``.pdf`` and ``.xls``/``.xlsx`` files are
    delegated to the corresponding loader class and the result of its
    ``load()`` call is returned unchanged. ``.json`` files are parsed here
    and flattened to text. Suffix matching is case-insensitive.

    Args:
        filepath: Path to the file (a ``pathlib.Path`` or a path string).

    Returns:
        A list of Documents. The list is empty when the file type or the
        JSON structure is not supported, so callers can always extend a
        result list with the return value.

    Raises:
        Errors from opening or parsing the file, or from the underlying
        loader, are not caught and propagate to the caller.
    """
    filepath = Path(filepath)
    print(f"Loading {filepath}")
    suffix = filepath.suffix.lower()

    if suffix in ('.txt', '.md'):
        # Markdown is loaded as plain text.
        return TextLoader(str(filepath)).load()
    if suffix == '.csv':
        return CSVLoader(file_path=str(filepath)).load()
    if suffix == '.pdf':
        return PyPDFLoader(str(filepath)).load()
    if suffix in ('.xls', '.xlsx'):
        return UnstructuredExcelLoader(str(filepath)).load()
    if suffix == '.json':
        # Explicit encoding: do not depend on the platform default.
        with open(filepath, encoding="utf-8") as f:
            json_data = json.load(f)
        return _json_to_documents(json_data, str(filepath))

    print(f"Unsupported file type: {filepath}")
    return []
def load_data_files(data_dir):
    """
    Loads all data files from the specified directory, handling various file types.

    The directory is searched recursively for entries whose name contains a
    dot; names without a dot are not matched by the glob pattern. Each
    matching regular file is passed to ``load_file``.

    Args:
        data_dir: The directory containing the data files.

    Returns:
        A list of Document objects, each representing a loaded document.
        The list is empty when nothing matches.
    """
    docs = []
    # Sorted so the resulting document order is reproducible across runs.
    for filepath in sorted(Path(data_dir).glob('**/*.*')):
        # The pattern also matches directories with a dot in their name.
        if not filepath.is_file():
            continue
        # Guard against a None result so that one unsupported file cannot
        # abort the whole scan with a TypeError.
        docs.extend(load_file(filepath) or [])
    return docs
if __name__ == "__main__":
    # Manual smoke test: load everything under the 'examples' directory
    # and print each resulting document.
    for doc in load_data_files("examples"):
        print(doc)