Spaces:

jameszokah
/

marigen_api

Sleeping

marigen_api / parser /msword_parser.py

Update parser/msword_parser.py

17ec764 verified over 1 year ago

1.74 kB

	from typing import Iterator
	from langchain_core.documents import Document
	from langchain_community.document_loaders.base import BaseBlobParser
	from langchain_community.document_loaders.blob_loaders import Blob


	class MsWordParser(BaseBlobParser):
	    """Parse Microsoft Word documents from a blob using python-docx."""

	    def lazy_parse(self, blob: Blob) -> Iterator[Document]:  # type: ignore[valid-type]
	        """Parse a Microsoft Word document into a Document iterator.

	        The text of every paragraph in the document's ``paragraphs``
	        collection is joined with blank lines and emitted as one Document.

	        Because this method is a generator, nothing below runs (and no
	        exception is raised) until the returned iterator is first advanced.

	        Args:
	            blob: The blob to parse.

	        Yields:
	            A single Document whose ``page_content`` is the joined paragraph
	            text and whose ``metadata`` is ``{"source": blob.source}``.

	        Raises:
	            ImportError: If python-docx is not installed.
	            ValueError: If ``blob.mimetype`` is not one of the supported types.
	        """
	        import logging

	        try:
	            from docx import Document as DocxDocument
	        except ImportError as e:
	            raise ImportError(
	                "Could not import python-docx, please install with `pip install python-docx`."
	            ) from e

	        # NOTE(review): python-docx targets the .docx (OOXML) format; confirm
	        # that blobs tagged "application/msword" (legacy .doc) can be opened.
	        supported_mime_types = [
	            "application/msword",
	            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	            "application/vnd.openxmlformats-officedocument.themeManager+xml",
	        ]

	        # Diagnostic only: goes through logging rather than stdout so library
	        # callers decide whether (and where) it is shown.
	        logging.getLogger(__name__).debug("Blob MIME type: %s", blob.mimetype)

	        if blob.mimetype not in supported_mime_types:  # type: ignore[attr-defined]
	            raise ValueError(
	                f"This blob type is not supported for this parser. Supported types are: {supported_mime_types}"
	            )

	        with blob.as_bytes_io() as word_document:  # type: ignore[attr-defined]
	            doc = DocxDocument(word_document)
	            text = "\n\n".join(para.text for para in doc.paragraphs)
	            metadata = {"source": blob.source}  # type: ignore[attr-defined]
	            yield Document(page_content=text, metadata=metadata)