Spaces:
Runtime error
Runtime error
File size: 5,888 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import csv
from io import TextIOWrapper
from typing import Any, Dict, List, Optional, Sequence
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.helpers import detect_file_encodings
from langchain.document_loaders.unstructured import (
UnstructuredFileLoader,
validate_unstructured_version,
)
class CSVLoader(BaseLoader):
"""Load a `CSV` file into a list of Documents.
Each document represents one row of the CSV file. Every row is converted into a
key/value pair and outputted to a new line in the document's page_content.
The source for each document loaded from csv is set to the value of the
`file_path` argument for all documents by default.
You can override this by setting the `source_column` argument to the
name of a column in the CSV file.
The source of each document will then be set to the value of the column
with the name specified in `source_column`.
Output Example:
.. code-block:: txt
column1: value1
column2: value2
column3: value3
"""
def __init__(
self,
file_path: str,
source_column: Optional[str] = None,
metadata_columns: Sequence[str] = (),
csv_args: Optional[Dict] = None,
encoding: Optional[str] = None,
autodetect_encoding: bool = False,
):
"""
Args:
file_path: The path to the CSV file.
source_column: The name of the column in the CSV file to use as the source.
Optional. Defaults to None.
metadata_columns: A sequence of column names to use as metadata. Optional.
csv_args: A dictionary of arguments to pass to the csv.DictReader.
Optional. Defaults to None.
encoding: The encoding of the CSV file. Optional. Defaults to None.
autodetect_encoding: Whether to try to autodetect the file encoding.
"""
self.file_path = file_path
self.source_column = source_column
self.metadata_columns = metadata_columns
self.encoding = encoding
self.csv_args = csv_args or {}
self.autodetect_encoding = autodetect_encoding
def load(self) -> List[Document]:
"""Load data into document objects."""
docs = []
try:
with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
docs = self.__read_file(csvfile)
except UnicodeDecodeError as e:
if self.autodetect_encoding:
detected_encodings = detect_file_encodings(self.file_path)
for encoding in detected_encodings:
try:
with open(
self.file_path, newline="", encoding=encoding.encoding
) as csvfile:
docs = self.__read_file(csvfile)
break
except UnicodeDecodeError:
continue
else:
raise RuntimeError(f"Error loading {self.file_path}") from e
except Exception as e:
raise RuntimeError(f"Error loading {self.file_path}") from e
return docs
def __read_file(self, csvfile: TextIOWrapper) -> List[Document]:
docs = []
csv_reader = csv.DictReader(csvfile, **self.csv_args) # type: ignore
for i, row in enumerate(csv_reader):
try:
source = (
row[self.source_column]
if self.source_column is not None
else self.file_path
)
except KeyError:
raise ValueError(
f"Source column '{self.source_column}' not found in CSV file."
)
content = "\n".join(
f"{k.strip()}: {v.strip() if v is not None else v}"
for k, v in row.items()
if k not in self.metadata_columns
)
metadata = {"source": source, "row": i}
for col in self.metadata_columns:
try:
metadata[col] = row[col]
except KeyError:
raise ValueError(f"Metadata column '{col}' not found in CSV file.")
doc = Document(page_content=content, metadata=metadata)
docs.append(doc)
return docs
class UnstructuredCSVLoader(UnstructuredFileLoader):
"""Load `CSV` files using `Unstructured`.
Like other
Unstructured loaders, UnstructuredCSVLoader can be used in both
"single" and "elements" mode. If you use the loader in "elements"
mode, the CSV file will be a single Unstructured Table element.
If you use the loader in "elements" mode, an HTML representation
of the table will be available in the "text_as_html" key in the
document metadata.
Examples
--------
from langchain.document_loaders.csv_loader import UnstructuredCSVLoader
loader = UnstructuredCSVLoader("stanley-cups.csv", mode="elements")
docs = loader.load()
"""
def __init__(
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
):
"""
Args:
file_path: The path to the CSV file.
mode: The mode to use when loading the CSV file.
Optional. Defaults to "single".
**unstructured_kwargs: Keyword arguments to pass to unstructured.
"""
validate_unstructured_version(min_unstructured_version="0.6.8")
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
def _get_elements(self) -> List:
from unstructured.partition.csv import partition_csv
return partition_csv(filename=self.file_path, **self.unstructured_kwargs)
|