mtDNALocation / NER /html /extractHTML.py
VyLala's picture
Upload 54 files
4a80798 verified
# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
from bs4 import BeautifulSoup
import requests
from DefaultPackages import openFile, saveFile
from NER import cleanText
import pandas as pd
class HTML():
def __init__(self, htmlFile, htmlLink):
self.htmlLink = htmlLink
self.htmlFile = htmlFile
# def openHTMLFile(self):
# headers = {
# "User-Agent": (
# "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
# "AppleWebKit/537.36 (KHTML, like Gecko) "
# "Chrome/114.0.0.0 Safari/537.36"
# ),
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Referer": self.htmlLink,
# "Connection": "keep-alive"
# }
# session = requests.Session()
# session.headers.update(headers)
# if self.htmlLink != "None":
# try:
# r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
# if r.status_code != 200:
# print(f"❌ HTML GET failed: {r.status_code} β€” {self.htmlLink}")
# return BeautifulSoup("", 'html.parser')
# soup = BeautifulSoup(r.content, 'html.parser')
# except Exception as e:
# print(f"❌ Exception fetching HTML: {e}")
# return BeautifulSoup("", 'html.parser')
# else:
# with open(self.htmlFile) as fp:
# soup = BeautifulSoup(fp, 'html.parser')
# return soup
from lxml.etree import ParserError, XMLSyntaxError
def openHTMLFile(self):
not_need_domain = ['https://broadinstitute.github.io/picard/',
'https://software.broadinstitute.org/gatk/best-practices/',
'https://www.ncbi.nlm.nih.gov/genbank/',
'https://www.mitomap.org/']
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": self.htmlLink,
"Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(headers)
if self.htmlLink in not_need_domain:
return BeautifulSoup("", 'html.parser')
try:
if self.htmlLink and self.htmlLink != "None":
r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
if r.status_code != 200 or not r.text.strip():
print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
return BeautifulSoup("", 'html.parser')
soup = BeautifulSoup(r.content, 'html.parser')
else:
with open(self.htmlFile, encoding='utf-8') as fp:
soup = BeautifulSoup(fp, 'html.parser')
except (ParserError, XMLSyntaxError, OSError) as e:
print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
return BeautifulSoup("", 'html.parser')
except Exception as e:
print(f"❌ General exception for {self.htmlLink}: {e}")
return BeautifulSoup("", 'html.parser')
return soup
def getText(self):
try:
soup = self.openHTMLFile()
s = soup.find_all("html")
text = ""
if s:
for t in range(len(s)):
text = s[t].get_text()
cl = cleanText.cleanGenText()
text = cl.removeExtraSpaceBetweenWords(text)
return text
except:
print("failed get text from html")
return ""
def getListSection(self, scienceDirect=None):
try:
json = {}
text = ""
textJson, textHTML = "",""
if scienceDirect == None:
# soup = self.openHTMLFile()
# # get list of section
# json = {}
# for h2Pos in range(len(soup.find_all('h2'))):
# if soup.find_all('h2')[h2Pos].text not in json:
# json[soup.find_all('h2')[h2Pos].text] = []
# if h2Pos + 1 < len(soup.find_all('h2')):
# content = soup.find_all('h2')[h2Pos].find_next("p")
# nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
# while content.text != nexth2Content.text:
# json[soup.find_all('h2')[h2Pos].text].append(content.text)
# content = content.find_next("p")
# else:
# content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
# json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)
soup = self.openHTMLFile()
h2_tags = soup.find_all('h2')
json = {}
for idx, h2 in enumerate(h2_tags):
section_title = h2.get_text(strip=True)
json.setdefault(section_title, [])
# Get paragraphs until next H2
next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
for p in h2.find_all_next("p"):
if next_h2 and p == next_h2:
break
json[section_title].append(p.get_text(strip=True))
# format
'''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
'Results':[], 'Discussion':[], 'References':[],
'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
'Additional information':[], 'Electronic supplementary material':[],
'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
if scienceDirect!= None or len(json)==0:
# Replace with your actual Elsevier API key
api_key = os.environ["SCIENCE_DIRECT_API"]
# ScienceDirect article DOI or PI (Example DOI)
doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
# Base URL for the Elsevier API
base_url = "https://api.elsevier.com/content/article/doi/"
# Set headers with API key
headers = {
"Accept": "application/json",
"X-ELS-APIKey": api_key
}
# Make the API request
response = requests.get(base_url + doi, headers=headers)
# Check if the request was successful
if response.status_code == 200:
data = response.json()
supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
# if "originalText" in list(supp_data.keys()):
# if type(supp_data["originalText"])==str:
# json["originalText"] = [supp_data["originalText"]]
# if type(supp_data["originalText"])==dict:
# json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
# else:
# if type(supp_data)==dict:
# for key in supp_data:
# json[key] = [supp_data[key]]
if type(data)==dict:
json["fullText"] = data
textJson = self.mergeTextInJson(json)
textHTML = self.getText()
if len(textHTML) > len(textJson):
text = textHTML
else: text = textJson
return text #json
except:
print("failed all")
return ""
def getReference(self):
# get reference to collect more next data
ref = []
json = self.getListSection()
for key in json["References"]:
ct = cleanText.cleanGenText(key)
cleanText, filteredWord = ct.cleanText()
if cleanText not in ref:
ref.append(cleanText)
return ref
def getSupMaterial(self):
# check if there is material or not
json = {}
soup = self.openHTMLFile()
for h2Pos in range(len(soup.find_all('h2'))):
if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
#print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
link, output = [],[]
if soup.find_all('h2')[h2Pos].text not in json:
json[soup.find_all('h2')[h2Pos].text] = []
for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
link.append(l["href"])
if h2Pos + 1 < len(soup.find_all('h2')):
nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
if nexth2Link in link:
link = link[:link.index(nexth2Link)]
# only take links having "https" in that
for i in link:
if "https" in i: output.append(i)
json[soup.find_all('h2')[h2Pos].text].extend(output)
return json
def extractTable(self):
soup = self.openHTMLFile()
df = []
if len(soup)>0:
try:
df = pd.read_html(str(soup))
except ValueError:
df = []
print("No tables found in HTML file")
return df
def mergeTextInJson(self,jsonHTML):
try:
#cl = cleanText.cleanGenText()
htmlText = ""
if jsonHTML:
# try:
# for sec, entries in jsonHTML.items():
# for i, entry in enumerate(entries):
# # Only process if it's actually text
# if isinstance(entry, str):
# if entry.strip():
# entry, filteredWord = cl.textPreprocessing(entry, keepPeriod=True)
# else:
# # Skip or convert dicts/lists to string if needed
# entry = str(entry)
# jsonHTML[sec][i] = entry
# # Add spacing between sentences
# if i - 1 >= 0 and jsonHTML[sec][i - 1] and jsonHTML[sec][i - 1][-1] != ".":
# htmlText += ". "
# htmlText += entry
# # Add final period if needed
# if entries and isinstance(entries[-1], str) and entries[-1] and entries[-1][-1] != ".":
# htmlText += "."
# htmlText += "\n\n"
# except:
htmlText += str(jsonHTML)
return htmlText
except:
print("failed merge text in json")
return ""