import xml.etree.ElementTree as ET

from bs4 import NavigableString, Tag
from packaging.version import Version, InvalidVersion
import requests


def Normalize_Section(section_number):
    """Return *section_number* normalized as a ``packaging`` version string.

    The value is parsed with ``packaging.version.Version`` and converted back
    to ``str``. If it is not a valid version string, an empty string is
    returned instead of raising ``InvalidVersion``.
    """
    try:
        return str(Version(section_number))
    except InvalidVersion:
        return ""


def _tag_text(tag):
    """Return ``tag.text``, or ``None`` when the lookup found no tag."""
    return tag.text if tag is not None else None


def Get_Bibliography(article):
    """Collect the bibliographic references found in a parsed document.

    Args:
        article: A BeautifulSoup object/tag that supports ``find_all``.

    Returns:
        A dict mapping each entry's ``xml:id`` attribute to a dict with the
        keys ``title``, ``authors``, ``journal``, ``volume``, ``issue``,
        ``pages``, ``year`` and ``doi``. A field whose element is missing is
        ``None``; ``authors`` is a (possibly empty) list of strings.
    """
    bibliography = {}

    # The first <biblStruct> is skipped on purpose.
    # NOTE(review): presumably it describes the document itself rather than
    # a cited reference -- confirm against the producer of the XML.
    for entry in article.find_all('biblStruct')[1:]:
        xml_id = entry.get('xml:id')

        # Authors without a <persName> are skipped; previously they raised
        # AttributeError because find_all() was called on None.
        authors = []
        for author in entry.find_all('author'):
            pers_name = author.persName
            if pers_name is None:
                continue
            authors.append(" ".join(part.text for part in pers_name.find_all()))

        # <monogr> and <imprint> may be absent; guard before descending so a
        # sparse reference yields None fields instead of an AttributeError.
        monogr = entry.monogr
        journal = _tag_text(monogr.find('title')) if monogr is not None else None

        imprint = entry.imprint
        date = imprint.find('date') if imprint is not None else None
        year = date.get('when') if date is not None else None

        bibliography[xml_id] = {
            'title': _tag_text(entry.find('title')),
            'authors': authors,
            'journal': journal,
            'volume': _tag_text(entry.find('biblScope', {'unit': 'volume'})),
            'issue': _tag_text(entry.find('biblScope', {'unit': 'issue'})),
            'pages': _tag_text(entry.find('biblScope', {'unit': 'page'})),
            'year': year,
            'doi': _tag_text(entry.find('idno', {'type': 'DOI'})),
        }

    return bibliography


def _post_pdf(url, pdf, data, accept):
    """POST *pdf* as the multipart ``input`` field to *url*; return the body text.

    The HTTP status is not checked: the response text is returned as-is,
    whatever the status code. Network failures and timeouts (60 s) propagate
    as ``requests`` exceptions.
    """
    files = {"input": ("", pdf, "application/pdf", {"Expires": "0"})}
    response = requests.post(
        url,
        headers={"Accept": accept},
        files=files,
        data=data,
        timeout=60,
    )
    return response.text


def GParse_Header(pdf):
    """Send *pdf* to the GROBID ``processHeaderDocument`` endpoint.

    Args:
        pdf: The PDF content (bytes or a binary file object) to upload.

    Returns:
        The response body as text.
    """
    data = {
        'generateIDs': 1,
        'consolidateHeader': 0,
        'segmentSentences': 1,
        'teiCoordinates': ["head", "s", "p"],
    }
    return _post_pdf(
        "https://kaiserml-grobid.hf.space/api/processHeaderDocument/",
        pdf,
        data,
        "application/xml, text/xml, */*; q=0.01",
    )


def GParse_Paper(pdf):
    """Send *pdf* to the GROBID ``processFulltextDocument`` endpoint.

    Args:
        pdf: The PDF content (bytes or a binary file object) to upload.

    Returns:
        The response body as text.
    """
    # Only "head" coordinates are requested; sentence segmentation and the
    # wider coordinate set ("s", "p", "figure", "formula", "note", "title")
    # are intentionally not enabled here.
    data = {
        'generateIDs': 1,
        'teiCoordinates': ["head"],
    }
    return _post_pdf(
        "https://Kaiserml-grobid.hf.space/api/processFulltextDocument/",
        pdf,
        data,
        "application/xml, text/xml",
    )


def Resolve_GHeader(xml):
    """Look up candidate works on OpenAlex for the title found in *xml*.

    Args:
        xml: A TEI XML document as a string.

    Returns:
        The ``results`` list from the OpenAlex works autocomplete endpoint,
        or an empty list when the document has no non-empty ``<title>``.

    Raises:
        xml.etree.ElementTree.ParseError: If *xml* is not well-formed.
        requests.RequestException: On network failure or timeout.
    """
    root = ET.fromstring(xml)
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    title_element = root.find('.//tei:title', namespaces=ns)
    title = title_element.text if title_element is not None else None
    if not title or not title.strip():
        # Nothing to search for; avoid querying for "None" or an empty string.
        return []

    # Pass the title through ``params`` so that characters such as "&", "#"
    # or "+" are percent-encoded instead of corrupting the query string.
    response = requests.get(
        "https://api.openalex.org/autocomplete/works",
        params={"q": title},
        timeout=60,
    )
    return response.json()['results']