Spaces:
Runtime error
Runtime error
File size: 5,750 Bytes
129cd69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import logging
from typing import Dict, Iterator, List, Union
import requests
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
logger = logging.getLogger(__name__)
class ServerUnavailableException(Exception):
"""Exception raised when the Grobid server is unavailable."""
pass
class GrobidParser(BaseBlobParser):
"""Load article `PDF` files using `Grobid`."""
def __init__(
self,
segment_sentences: bool,
grobid_server: str = "http://localhost:8070/api/processFulltextDocument",
) -> None:
self.segment_sentences = segment_sentences
self.grobid_server = grobid_server
try:
requests.get(grobid_server)
except requests.exceptions.RequestException:
logger.error(
"GROBID server does not appear up and running, \
please ensure Grobid is installed and the server is running"
)
raise ServerUnavailableException
def process_xml(
self, file_path: str, xml_data: str, segment_sentences: bool
) -> Iterator[Document]:
"""Process the XML file from Grobin."""
try:
from bs4 import BeautifulSoup
except ImportError:
raise ImportError(
"`bs4` package not found, please install it with " "`pip install bs4`"
)
soup = BeautifulSoup(xml_data, "xml")
sections = soup.find_all("div")
title = soup.find_all("title")[0].text
chunks = []
for section in sections:
sect = section.find("head")
if sect is not None:
for i, paragraph in enumerate(section.find_all("p")):
chunk_bboxes = []
paragraph_text = []
for i, sentence in enumerate(paragraph.find_all("s")):
paragraph_text.append(sentence.text)
sbboxes = []
for bbox in sentence.get("coords").split(";"):
box = bbox.split(",")
sbboxes.append(
{
"page": box[0],
"x": box[1],
"y": box[2],
"h": box[3],
"w": box[4],
}
)
chunk_bboxes.append(sbboxes)
if segment_sentences is True:
fpage, lpage = sbboxes[0]["page"], sbboxes[-1]["page"]
sentence_dict = {
"text": sentence.text,
"para": str(i),
"bboxes": [sbboxes],
"section_title": sect.text,
"section_number": sect.get("n"),
"pages": (fpage, lpage),
}
chunks.append(sentence_dict)
if segment_sentences is not True:
fpage, lpage = (
chunk_bboxes[0][0]["page"],
chunk_bboxes[-1][-1]["page"],
)
paragraph_dict = {
"text": "".join(paragraph_text),
"para": str(i),
"bboxes": chunk_bboxes,
"section_title": sect.text,
"section_number": sect.get("n"),
"pages": (fpage, lpage),
}
chunks.append(paragraph_dict)
yield from [
Document(
page_content=chunk["text"],
metadata=dict(
{
"text": str(chunk["text"]),
"para": str(chunk["para"]),
"bboxes": str(chunk["bboxes"]),
"pages": str(chunk["pages"]),
"section_title": str(chunk["section_title"]),
"section_number": str(chunk["section_number"]),
"paper_title": str(title),
"file_path": str(file_path),
}
),
)
for chunk in chunks
]
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
file_path = blob.source
if file_path is None:
raise ValueError("blob.source cannot be None.")
pdf = open(file_path, "rb")
files = {"input": (file_path, pdf, "application/pdf", {"Expires": "0"})}
try:
data: Dict[str, Union[str, List[str]]] = {}
for param in ["generateIDs", "consolidateHeader", "segmentSentences"]:
data[param] = "1"
data["teiCoordinates"] = ["head", "s"]
files = files or {}
r = requests.request(
"POST",
self.grobid_server,
headers=None,
params=None,
files=files,
data=data,
timeout=60,
)
xml_data = r.text
except requests.exceptions.ReadTimeout:
logger.error("GROBID server timed out. Return None.")
xml_data = None
if xml_data is None:
return iter([])
else:
return self.process_xml(file_path, xml_data, self.segment_sentences)
|