|
from packaging.version import Version, InvalidVersion |
|
from bs4 import NavigableString,Tag |
|
import requests |
|
import xml.etree.ElementTree as ET |
|
|
|
def Normalize_Section(section_number):
    """Return the canonical string form of a section number.

    The value is parsed with ``packaging.version.Version`` and converted back
    to a string, so the result is whatever canonical form that class produces
    for the given dotted number.

    Args:
        section_number: Section label to normalize, e.g. ``"1.2"``.

    Returns:
        The normalized section number, or ``""`` when the input is not a
        string or cannot be parsed as a version.
    """
    # A heading without a number may arrive as None (or another non-string);
    # Version() would raise TypeError for it, so treat it like any other
    # unparseable value instead of crashing.
    if not isinstance(section_number, str):
        return ""

    try:
        parsed = Version(section_number)
    except InvalidVersion:
        return ""
    return str(parsed)
|
|
|
|
|
def _bibl_text(node):
    """Return ``node.text``, or ``None`` when the lookup found no element."""
    return node.text if node is not None else None


def Get_Bibliography(article):
    """Collect the bibliography entries of a parsed TEI document.

    Args:
        article: Parsed document exposing the BeautifulSoup-style
            ``find_all`` / ``find`` API (presumably GROBID TEI output --
            confirm against the caller).

    Returns:
        A dict mapping each entry's ``xml:id`` attribute to a dict with the
        keys ``title``, ``authors``, ``journal``, ``volume``, ``issue``,
        ``pages``, ``year`` and ``doi``. ``authors`` is a list of names;
        every other field is a string or ``None`` when the element is absent.
        Entries without an ``xml:id`` all share the key ``None``.
    """
    bibliography = {}

    # The first biblStruct is skipped on purpose (behaviour kept from the
    # original); presumably it describes the paper itself rather than a
    # cited work -- TODO confirm.
    for entry in article.find_all('biblStruct')[1:]:
        # Look these containers up once; either may be missing in an
        # incomplete entry, which previously raised AttributeError.
        monogr = entry.find('monogr')
        imprint = entry.find('imprint')
        date = imprint.find('date') if imprint is not None else None

        authors = []
        for author in entry.find_all('author'):
            pers_name = author.find('persName')
            if pers_name is None:
                # No personal name under this author element: skip it
                # rather than failing the whole bibliography.
                continue
            # Join the text of every element nested under persName.
            authors.append(" ".join(part.text for part in pers_name.find_all()))

        bibliography[entry.get('xml:id')] = {
            'title': _bibl_text(entry.find('title')),
            'authors': authors,
            'journal': _bibl_text(monogr.find('title')) if monogr is not None else None,
            'volume': _bibl_text(entry.find('biblScope', {'unit': 'volume'})),
            'issue': _bibl_text(entry.find('biblScope', {'unit': 'issue'})),
            'pages': _bibl_text(entry.find('biblScope', {'unit': 'page'})),
            'year': date.get('when') if date is not None else None,
            'doi': _bibl_text(entry.find('idno', {'type': 'DOI'})),
        }

    return bibliography
|
|
|
def GParse_Header(pdf):
    """Post a PDF to the GROBID ``processHeaderDocument`` endpoint.

    Args:
        pdf: The PDF content, handed to ``requests`` unchanged as the
            ``input`` part of the multipart upload.

    Returns:
        The response body as text. The HTTP status is not checked, so an
        error response is returned as text as well.
    """
    endpoint = "https://kaiserml-grobid.hf.space/api/processHeaderDocument/"

    # Multipart part: (filename, content, content type, extra part headers).
    upload = {"input": ("", pdf, "application/pdf", {"Expires": "0"})}

    # Form fields sent alongside the file; key order matches the original.
    form_fields = {
        'generateIDs': 1,
        'consolidateHeader': 0,
        'segmentSentences': 1,
        "teiCoordinates": ["head", "s", "p"],
    }

    response = requests.request(
        method="POST",
        url=endpoint,
        headers={"Accept": "application/xml, text/xml, */*; q=0.01"},
        files=upload,
        data=form_fields,
        timeout=60,
    )
    return response.text
|
|
|
def GParse_Paper(pdf):
    """Post a PDF to the GROBID ``processFulltextDocument`` endpoint.

    Args:
        pdf: The PDF content, handed to ``requests`` unchanged as the
            ``input`` part of the multipart upload.

    Returns:
        The response body as text. The HTTP status is not checked, so an
        error response is returned as text as well.
    """
    endpoint = "https://Kaiserml-grobid.hf.space/api/processFulltextDocument/"

    # Multipart part: (filename, content, content type, extra part headers).
    upload = {"input": ("", pdf, "application/pdf", {"Expires": "0"})}

    # Form fields sent alongside the file; key order matches the original.
    form_fields = {
        'generateIDs': 1,
        "teiCoordinates": ["head"],
    }

    response = requests.request(
        method="POST",
        url=endpoint,
        headers={"Accept": "application/xml, text/xml"},
        files=upload,
        data=form_fields,
        timeout=60,
    )
    return response.text
|
|
|
|
|
def Resolve_GHeader(xml):
    """Look up OpenAlex work suggestions for the title in a TEI document.

    The first ``title`` element in the TEI namespace is read and its text is
    sent to the OpenAlex works autocomplete endpoint.

    Args:
        xml: TEI XML as a string (presumably a GROBID header response --
            confirm against the caller).

    Returns:
        The ``results`` list from the OpenAlex response, or ``[]`` when the
        document has no title element or the title is empty.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml`` is not well-formed.
    """
    root = ET.fromstring(xml)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    title_node = root.find('.//tei:title', namespaces=ns)
    title = (title_node.text or "").strip() if title_node is not None else ""
    if not title:
        # Nothing to search for; previously this either raised
        # AttributeError or queried the literal string "None".
        return []

    # Pass the title through ``params`` so characters such as '&', '#' or
    # '+' are URL-encoded instead of corrupting the query string.
    response = requests.get(
        "https://api.openalex.org/autocomplete/works",
        params={"q": title},
        timeout=60,  # same limit as the other HTTP calls in this file
    )
    return response.json()['results']