File size: 2,837 Bytes
ee8fb16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import json
from pathlib import Path

from pypdf import PdfReader
from langchain.docstore.document import Document
from langchain_community.document_loaders import TextLoader, CSVLoader, PyPDFLoader
from langchain_community.document_loaders.excel import UnstructuredExcelLoader

def _render_json_item(item):
    """Render one JSON list element as text.

    Dicts become newline-separated ``key: value`` lines; any other value is
    rendered with ``str()``.
    """
    if isinstance(item, dict):
        return "\n".join(f"{k}: {v}" for k, v in item.items())
    return str(item)


def _json_to_documents(json_data, source):
    """Convert parsed JSON into Document objects tagged with ``source``.

    A top-level list yields one Document per element; a top-level dict yields
    a single Document with one ``**key**`` section per entry. Any other
    top-level value is reported and yields an empty list.
    """
    if isinstance(json_data, list):
        # One Document per element (previously only the first element was
        # returned because the return statement sat inside the loop).
        return [
            Document(page_content=_render_json_item(item), metadata={'source': source})
            for item in json_data
        ]

    if isinstance(json_data, dict):
        parts = []
        for key, value in json_data.items():
            parts.append(f"**{key}**\n\n")
            if isinstance(value, list):
                parts.extend(_render_json_item(item) + "\n\n" for item in value)
            else:
                parts.append(str(value) + "\n\n")
        return [Document(page_content="".join(parts), metadata={'source': source})]

    print(f"Unsupported JSON structure in {source}")
    return []


def load_file(filepath):
    """Load a single file into a list of Document objects.

    The loader is chosen from the file suffix (matched case-insensitively):
    ``.txt``/``.md`` use TextLoader, ``.csv`` uses CSVLoader, ``.pdf`` uses
    PyPDFLoader, ``.xls``/``.xlsx`` use UnstructuredExcelLoader, and ``.json``
    is parsed directly and converted to Documents.

    Args:
        filepath: Path to the file, as a ``pathlib.Path`` or a string.

    Returns:
        A list of Document objects. The list is empty when the file type or
        JSON structure is unsupported, so callers can always ``extend`` with
        the result.

    Raises:
        Whatever the underlying loader, ``open`` or ``json.load`` raises;
        errors are not swallowed here.
    """
    filepath = Path(filepath)
    print(f"Loading {filepath}")
    suffix = filepath.suffix.lower()

    if suffix in ('.txt', '.md'):
        # Markdown is loaded as plain text.
        return TextLoader(str(filepath)).load()
    if suffix == '.csv':
        return CSVLoader(file_path=str(filepath)).load()
    if suffix == '.pdf':
        return PyPDFLoader(str(filepath)).load()
    if suffix in ('.xls', '.xlsx'):
        return UnstructuredExcelLoader(str(filepath)).load()
    if suffix == '.json':
        # JSON text is UTF-8 by specification; do not rely on the locale default.
        with open(filepath, encoding="utf-8") as f:
            json_data = json.load(f)
        return _json_to_documents(json_data, str(filepath))

    print(f"Unsupported file type: {filepath}")
    return []

def load_data_files(data_dir):
    """
    Loads all data files from the specified directory, handling various file types.

    The directory is searched recursively for entries whose name contains a
    dot. Only regular files are passed to ``load_file``; directories that
    happen to have a dot in their name are skipped.

    Args:
        data_dir: The directory containing the data files.

    Returns:
        A list of Document objects, each representing a loaded document.
        The list is empty when the directory has no matching files.
    """
    docs = []
    for filepath in Path(data_dir).glob('**/*.*'):
        # The pattern also matches directories such as "release.v1".
        if not filepath.is_file():
            continue
        # Tolerate a loader that returns None for unsupported files.
        docs.extend(load_file(filepath) or [])
    return docs


if __name__ == "__main__":
    # Manual smoke test: load everything under 'examples' and print each document.
    for loaded_doc in load_data_files("examples"):
        print(loaded_doc)