# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
from io import StringIO

from bs4 import BeautifulSoup
import requests
from lxml.etree import ParserError, XMLSyntaxError
import pandas as pd

from DefaultPackages import openFile, saveFile
from NER import cleanText


class HTML():
    """Fetch an HTML document (from a URL or a local file) and extract its
    text, sections, references, supplementary-material links and tables.

    htmlLink is the URL to fetch; the string "None" (or a falsy value) means
    "read the local file htmlFile instead".
    """

    def __init__(self, htmlFile, htmlLink):
        self.htmlLink = htmlLink
        self.htmlFile = htmlFile

    def openHTMLFile(self):
        """Return a BeautifulSoup of the page, or an empty soup on any failure.

        Fetches self.htmlLink over HTTP when it is set (and not "None"),
        otherwise parses the local file self.htmlFile. Never raises: every
        failure path prints a diagnostic and returns BeautifulSoup("", ...).
        """
        # Domains we deliberately skip fetching (no useful parseable content).
        not_need_domain = [
            'https://broadinstitute.github.io/picard/',
            'https://software.broadinstitute.org/gatk/best-practices/',
            'https://www.ncbi.nlm.nih.gov/genbank/',
            'https://www.mitomap.org/',
        ]
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/114.0.0.0 Safari/537.36"
            ),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Referer": self.htmlLink,
            "Connection": "keep-alive",
        }
        session = requests.Session()
        session.headers.update(headers)
        if self.htmlLink in not_need_domain:
            return BeautifulSoup("", 'html.parser')
        try:
            if self.htmlLink and self.htmlLink != "None":
                r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
                if r.status_code != 200 or not r.text.strip():
                    print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
                    return BeautifulSoup("", 'html.parser')
                soup = BeautifulSoup(r.content, 'html.parser')
            else:
                with open(self.htmlFile, encoding='utf-8') as fp:
                    soup = BeautifulSoup(fp, 'html.parser')
        except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
            return BeautifulSoup("", 'html.parser')
        except Exception as e:
            print(f"❌ General exception for {self.htmlLink}: {e}")
            return BeautifulSoup("", 'html.parser')
        return soup

    def getText(self):
        """Return the whitespace-normalized text of the page's <html> element(s).

        Returns "" on any failure. NOTE: when several <html> elements are
        found, only the last one's text is kept (original behavior preserved).
        """
        try:
            soup = self.openHTMLFile()
            html_nodes = soup.find_all("html")
            text = ""
            if html_nodes:
                for node in html_nodes:
                    text = node.get_text()
                cl = cleanText.cleanGenText()
                text = cl.removeExtraSpaceBetweenWords(text)
            return text
        except Exception:
            print("failed get text from html")
            return ""

    def getListSection(self, scienceDirect=None):
        """Extract the article's text, grouped by <h2> section.

        When scienceDirect is None, sections are scraped from the fetched
        HTML; a typical section dict looks like
        {'Abstract': [...], 'Introduction': [...], 'Methods': [...], ...}.
        When scienceDirect is set (or no sections were found), the full text
        is retrieved from the Elsevier API instead (requires the
        SCIENCE_DIRECT_API environment variable).

        Returns the longer of the merged-section text and the raw page text;
        "" on failure.
        """
        try:
            json = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                h2_tags = soup.find_all('h2')
                for idx, h2 in enumerate(h2_tags):
                    section_title = h2.get_text(strip=True)
                    json.setdefault(section_title, [])
                    next_h2 = h2_tags[idx + 1] if idx + 1 < len(h2_tags) else None
                    # Walk forward through BOTH <p> and <h2> tags so the loop
                    # actually stops at the next section header (comparing a
                    # <p> against an <h2> can never match).
                    for el in h2.find_all_next(["p", "h2"]):
                        if next_h2 is not None and el == next_h2:
                            break
                        if el.name == "p":
                            json[section_title].append(el.get_text(strip=True))
            if scienceDirect is not None or len(json) == 0:
                # Elsevier (ScienceDirect) full-text retrieval by DOI.
                api_key = os.environ["SCIENCE_DIRECT_API"]
                doi = self.htmlLink.split("https://doi.org/")[-1]  # e.g. "10.1016/j.ajhg.2011.01.009"
                base_url = "https://api.elsevier.com/content/article/doi/"
                api_headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key,
                }
                response = requests.get(base_url + doi, headers=api_headers, timeout=30)
                if response.status_code == 200:
                    data = response.json()
                    if isinstance(data, dict):
                        json["fullText"] = data
            textJson = self.mergeTextInJson(json)
            textHTML = self.getText()
            # Prefer whichever extraction recovered more content.
            text = textHTML if len(textHTML) > len(textJson) else textJson
            return text
        except Exception:
            print("failed all")
            return ""

    def getReference(self):
        """Collect cleaned, de-duplicated reference strings for follow-up crawling.

        NOTE(review): getListSection currently returns a string, not a dict,
        so indexing it with ["References"] will raise TypeError — confirm the
        intended contract with callers before relying on this method.
        """
        ref = []
        sections = self.getListSection()
        for entry in sections["References"]:
            ct = cleanText.cleanGenText(entry)
            # Do not shadow the imported `cleanText` module with the result.
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref

    def getSupMaterial(self):
        """Map supplementary/material-related <h2> headings to https links under them.

        Returns {heading text: [https links...]}; empty dict when none found.
        """
        json = {}
        soup = self.openHTMLFile()
        h2_tags = soup.find_all('h2')  # hoisted: avoid re-querying inside the loop
        keywords = ("supplementary", "material", "additional", "support")
        for pos, h2 in enumerate(h2_tags):
            title = h2.text
            if not any(k in title.lower() for k in keywords):
                continue
            link = []
            json.setdefault(title, [])
            for a in h2.find_all_next("a", href=True):
                link.append(a["href"])
            if pos + 1 < len(h2_tags):
                # Truncate at the first link of the next section, if that
                # section has any link at all (guard against None).
                next_a = h2_tags[pos + 1].find_next("a", href=True)
                if next_a is not None and next_a["href"] in link:
                    link = link[:link.index(next_a["href"])]
            # Only keep absolute https links.
            output = [i for i in link if "https" in i]
            json[title].extend(output)
        return json

    def extractTable(self):
        """Return a list of DataFrames for every <table> on the page ([] if none)."""
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                # StringIO wrapper: passing literal HTML to read_html is
                # deprecated since pandas 2.1.
                df = pd.read_html(StringIO(str(soup)))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df

    def mergeTextInJson(self, jsonHTML):
        """Flatten the section dict into one text blob; "" on failure or empty input."""
        try:
            htmlText = ""
            if jsonHTML:
                htmlText += str(jsonHTML)
            return htmlText
        except Exception:
            print("failed merge text in json")
            return ""