# pdf-toweb / utils.py
# Created by Corran (commit a917763); utilities for GROBID parsing and
# OpenAlex resolution. (Hugging Face file-viewer chrome removed.)
from packaging.version import Version, InvalidVersion
from bs4 import NavigableString,Tag
import requests
import xml.etree.ElementTree as ET
def Normalize_Section(section_number):
    """Return the canonical (PEP 440-normalized) form of a section number.

    Section labels such as "1.2.0" are parsed as version strings so that
    equivalent spellings collapse to one canonical text.

    Parameters
    ----------
    section_number : str
        Raw section-number string extracted from the document.

    Returns
    -------
    str
        The normalized string, or "" when the input cannot be parsed.
    """
    try:
        parsed = Version(section_number)
    except InvalidVersion:
        # Not a version-like label (e.g. "Appendix A") — signal with "".
        return ""
    return str(parsed)
def Get_Bibliography(article):
    """Extract the reference list from a GROBID TEI document.

    Parameters
    ----------
    article : bs4.BeautifulSoup
        Parsed TEI XML of the paper (as produced by GParse_Paper).

    Returns
    -------
    dict
        Maps each reference's ``xml:id`` to a dict with keys ``title``,
        ``authors``, ``journal``, ``volume``, ``issue``, ``pages``,
        ``year`` and ``doi``; fields absent from the entry are None.
    """
    bibliography = {}
    # Skip the first biblStruct: it describes the paper itself, not a reference.
    for entry in article.find_all('biblStruct')[1:]:
        xml_id = entry.get('xml:id')
        title = entry.find('title').text if entry.find('title') else None
        # An <author> element may lack a <persName>; skip those instead of
        # crashing with AttributeError on None (bug in the original).
        authors = [
            " ".join(part.text for part in author.persName.find_all())
            for author in entry.find_all('author')
            if author.persName is not None
        ]
        # <monogr> holds the container (journal/book) metadata; guard against
        # entries that have no <monogr> at all.
        monogr = entry.monogr
        journal = monogr.find('title').text if monogr is not None and monogr.find('title') else None
        volume = entry.find('biblScope', {'unit': 'volume'}).text if entry.find('biblScope', {'unit': 'volume'}) else None
        issue = entry.find('biblScope', {'unit': 'issue'}).text if entry.find('biblScope', {'unit': 'issue'}) else None
        pages = entry.find('biblScope', {'unit': 'page'}).text if entry.find('biblScope', {'unit': 'page'}) else None
        # <imprint> itself may be missing; the original only checked for
        # <date> and would crash on entry.imprint.find(...) when imprint is None.
        imprint = entry.imprint
        year = imprint.date.get('when') if imprint is not None and imprint.find('date') else None
        doi = entry.find('idno', {'type': 'DOI'}).text if entry.find('idno', {'type': 'DOI'}) else None
        bibliography[xml_id] = {
            'title': title,
            'authors': authors,
            'journal': journal,
            'volume': volume,
            'issue': issue,
            'pages': pages,
            'year': year,
            'doi': doi
        }
    return bibliography
def GParse_Header(pdf):
    """POST a PDF to the hosted GROBID header service and return TEI XML text.

    Parameters
    ----------
    pdf : bytes or file-like
        The PDF payload to upload.

    Returns
    -------
    str
        The raw TEI XML response body from processHeaderDocument.
    """
    response = requests.post(
        "https://kaiserml-grobid.hf.space/api/processHeaderDocument/",
        headers={"Accept": "application/xml, text/xml, */*; q=0.01"},
        files={"input": ("", pdf, "application/pdf", {"Expires": "0"})},
        data={
            "generateIDs": 1,
            "consolidateHeader": 0,
            "segmentSentences": 1,
            # Request bounding-box coordinates for heads, sentences, paragraphs.
            "teiCoordinates": ["head", "s", "p"],
        },
        timeout=60,
    )
    return response.text
def GParse_Paper(pdf):
    """POST a PDF to the hosted GROBID fulltext service and return TEI XML text.

    Parameters
    ----------
    pdf : bytes or file-like
        The PDF payload to upload.

    Returns
    -------
    str
        The raw TEI XML response body from processFulltextDocument.
    """
    payload = {
        "generateIDs": 1,
        # "segmentSentences": 1,
        # Only head coordinates are requested; the fuller set was:
        # ["head", "s", "p", "figure", "formula", "note", "title"]
        "teiCoordinates": ["head"],
    }
    response = requests.post(
        "https://Kaiserml-grobid.hf.space/api/processFulltextDocument/",
        headers={"Accept": "application/xml, text/xml"},
        files={"input": ("", pdf, "application/pdf", {"Expires": "0"})},
        data=payload,
        timeout=60,
    )
    return response.text
def Resolve_GHeader(xml):
    """Resolve a GROBID header TEI document to candidate OpenAlex works.

    Parameters
    ----------
    xml : str
        TEI XML string as returned by GParse_Header.

    Returns
    -------
    list
        OpenAlex autocomplete result dicts for the extracted title; an
        empty list when no title could be found in the TEI.
    """
    root = ET.fromstring(xml)
    # TEI elements live in this namespace; a bare 'title' would match nothing.
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}
    title_el = root.find('.//tei:title', namespaces=ns)
    # Guard the missing-title case: the original dereferenced .text on None
    # and raised AttributeError instead of failing gracefully.
    title = title_el.text if title_el is not None else None
    if not title:
        return []
    # Pass the title via params so requests URL-encodes it; the original
    # f-string interpolation produced broken queries for titles containing
    # spaces, '&', '#', etc.
    response = requests.get(
        "https://api.openalex.org/autocomplete/works",
        params={"q": title},
        timeout=60,
    )
    return response.json()['results']