Spaces:

VyLala
/

mtDNALocation

Running

App Files Files Community

mtDNALocation / NER /html /extractHTML.py

VyLala

Upload 54 files

4a80798 verified 10 days ago

raw

history blame contribute delete

10.5 kB

	# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
	from bs4 import BeautifulSoup
	import requests
	from DefaultPackages import openFile, saveFile
	from NER import cleanText
	import pandas as pd
	class HTML():
	def __init__(self, htmlFile, htmlLink):
	self.htmlLink = htmlLink
	self.htmlFile = htmlFile
	# def openHTMLFile(self):
	# headers = {
	# "User-Agent": (
	# "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	# "AppleWebKit/537.36 (KHTML, like Gecko) "
	# "Chrome/114.0.0.0 Safari/537.36"
	# ),
	# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8",
	# "Referer": self.htmlLink,
	# "Connection": "keep-alive"
	# }

	# session = requests.Session()
	# session.headers.update(headers)

	# if self.htmlLink != "None":
	# try:
	# r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
	# if r.status_code != 200:
	# print(f"❌ HTML GET failed: {r.status_code} — {self.htmlLink}")
	# return BeautifulSoup("", 'html.parser')
	# soup = BeautifulSoup(r.content, 'html.parser')
	# except Exception as e:
	# print(f"❌ Exception fetching HTML: {e}")
	# return BeautifulSoup("", 'html.parser')
	# else:
	# with open(self.htmlFile) as fp:
	# soup = BeautifulSoup(fp, 'html.parser')
	# return soup
	from lxml.etree import ParserError, XMLSyntaxError

	def openHTMLFile(self):
	not_need_domain = ['https://broadinstitute.github.io/picard/',
	'https://software.broadinstitute.org/gatk/best-practices/',
	'https://www.ncbi.nlm.nih.gov/genbank/',
	'https://www.mitomap.org/']
	headers = {
	"User-Agent": (
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	"AppleWebKit/537.36 (KHTML, like Gecko) "
	"Chrome/114.0.0.0 Safari/537.36"
	),
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8",
	"Referer": self.htmlLink,
	"Connection": "keep-alive"
	}

	session = requests.Session()
	session.headers.update(headers)
	if self.htmlLink in not_need_domain:
	return BeautifulSoup("", 'html.parser')
	try:
	if self.htmlLink and self.htmlLink != "None":
	r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
	if r.status_code != 200 or not r.text.strip():
	print(f"❌ HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
	return BeautifulSoup("", 'html.parser')
	soup = BeautifulSoup(r.content, 'html.parser')
	else:
	with open(self.htmlFile, encoding='utf-8') as fp:
	soup = BeautifulSoup(fp, 'html.parser')
	except (ParserError, XMLSyntaxError, OSError) as e:
	print(f"🚫 HTML parse error for {self.htmlLink}: {type(e).__name__}")
	return BeautifulSoup("", 'html.parser')
	except Exception as e:
	print(f"❌ General exception for {self.htmlLink}: {e}")
	return BeautifulSoup("", 'html.parser')

	return soup

	def getText(self):
	try:
	soup = self.openHTMLFile()
	s = soup.find_all("html")
	text = ""
	if s:
	for t in range(len(s)):
	text = s[t].get_text()
	cl = cleanText.cleanGenText()
	text = cl.removeExtraSpaceBetweenWords(text)
	return text
	except:
	print("failed get text from html")
	return ""
	def getListSection(self, scienceDirect=None):
	try:
	json = {}
	text = ""
	textJson, textHTML = "",""
	if scienceDirect == None:
	# soup = self.openHTMLFile()
	# # get list of section
	# json = {}
	# for h2Pos in range(len(soup.find_all('h2'))):
	# if soup.find_all('h2')[h2Pos].text not in json:
	# json[soup.find_all('h2')[h2Pos].text] = []
	# if h2Pos + 1 < len(soup.find_all('h2')):
	# content = soup.find_all('h2')[h2Pos].find_next("p")
	# nexth2Content = soup.find_all('h2')[h2Pos+1].find_next("p")
	# while content.text != nexth2Content.text:
	# json[soup.find_all('h2')[h2Pos].text].append(content.text)
	# content = content.find_next("p")
	# else:
	# content = soup.find_all('h2')[h2Pos].find_all_next("p",string=True)
	# json[soup.find_all('h2')[h2Pos].text] = list(i.text for i in content)

	soup = self.openHTMLFile()
	h2_tags = soup.find_all('h2')
	json = {}

	for idx, h2 in enumerate(h2_tags):
	section_title = h2.get_text(strip=True)
	json.setdefault(section_title, [])

	# Get paragraphs until next H2
	next_h2 = h2_tags[idx+1] if idx+1 < len(h2_tags) else None
	for p in h2.find_all_next("p"):
	if next_h2 and p == next_h2:
	break
	json[section_title].append(p.get_text(strip=True))
	# format
	'''json = {'Abstract':[], 'Introduction':[], 'Methods'[],
	'Results':[], 'Discussion':[], 'References':[],
	'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
	'Additional information':[], 'Electronic supplementary material':[],
	'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
	if scienceDirect!= None or len(json)==0:
	# Replace with your actual Elsevier API key
	api_key = os.environ["SCIENCE_DIRECT_API"]
	# ScienceDirect article DOI or PI (Example DOI)
	doi = self.htmlLink.split("https://doi.org/")[-1] #"10.1016/j.ajhg.2011.01.009"
	# Base URL for the Elsevier API
	base_url = "https://api.elsevier.com/content/article/doi/"
	# Set headers with API key
	headers = {
	"Accept": "application/json",
	"X-ELS-APIKey": api_key
	}
	# Make the API request
	response = requests.get(base_url + doi, headers=headers)
	# Check if the request was successful
	if response.status_code == 200:
	data = response.json()
	supp_data = data["full-text-retrieval-response"]#["coredata"]["link"]
	# if "originalText" in list(supp_data.keys()):
	# if type(supp_data["originalText"])==str:
	# json["originalText"] = [supp_data["originalText"]]
	# if type(supp_data["originalText"])==dict:
	# json["originalText"] = [supp_data["originalText"][key] for key in supp_data["originalText"]]
	# else:
	# if type(supp_data)==dict:
	# for key in supp_data:
	# json[key] = [supp_data[key]]
	if type(data)==dict:
	json["fullText"] = data
	textJson = self.mergeTextInJson(json)
	textHTML = self.getText()
	if len(textHTML) > len(textJson):
	text = textHTML
	else: text = textJson
	return text #json
	except:
	print("failed all")
	return ""
	def getReference(self):
	# get reference to collect more next data
	ref = []
	json = self.getListSection()
	for key in json["References"]:
	ct = cleanText.cleanGenText(key)
	cleanText, filteredWord = ct.cleanText()
	if cleanText not in ref:
	ref.append(cleanText)
	return ref
	def getSupMaterial(self):
	# check if there is material or not
	json = {}
	soup = self.openHTMLFile()
	for h2Pos in range(len(soup.find_all('h2'))):
	if "supplementary" in soup.find_all('h2')[h2Pos].text.lower() or "material" in soup.find_all('h2')[h2Pos].text.lower() or "additional" in soup.find_all('h2')[h2Pos].text.lower() or "support" in soup.find_all('h2')[h2Pos].text.lower():
	#print(soup.find_all('h2')[h2Pos].find_next("a").get("href"))
	link, output = [],[]
	if soup.find_all('h2')[h2Pos].text not in json:
	json[soup.find_all('h2')[h2Pos].text] = []
	for l in soup.find_all('h2')[h2Pos].find_all_next("a",href=True):
	link.append(l["href"])
	if h2Pos + 1 < len(soup.find_all('h2')):
	nexth2Link = soup.find_all('h2')[h2Pos+1].find_next("a",href=True)["href"]
	if nexth2Link in link:
	link = link[:link.index(nexth2Link)]
	# only take links having "https" in that
	for i in link:
	if "https" in i: output.append(i)
	json[soup.find_all('h2')[h2Pos].text].extend(output)
	return json
	def extractTable(self):
	soup = self.openHTMLFile()
	df = []
	if len(soup)>0:
	try:
	df = pd.read_html(str(soup))
	except ValueError:
	df = []
	print("No tables found in HTML file")
	return df
	def mergeTextInJson(self,jsonHTML):
	try:
	#cl = cleanText.cleanGenText()
	htmlText = ""
	if jsonHTML:
	# try:
	# for sec, entries in jsonHTML.items():
	# for i, entry in enumerate(entries):
	# # Only process if it's actually text
	# if isinstance(entry, str):
	# if entry.strip():
	# entry, filteredWord = cl.textPreprocessing(entry, keepPeriod=True)
	# else:
	# # Skip or convert dicts/lists to string if needed
	# entry = str(entry)

	# jsonHTML[sec][i] = entry

	# # Add spacing between sentences
	# if i - 1 >= 0 and jsonHTML[sec][i - 1] and jsonHTML[sec][i - 1][-1] != ".":
	# htmlText += ". "
	# htmlText += entry

	# # Add final period if needed
	# if entries and isinstance(entries[-1], str) and entries[-1] and entries[-1][-1] != ".":
	# htmlText += "."
	# htmlText += "\n\n"
	# except:
	htmlText += str(jsonHTML)
	return htmlText
	except:
	print("failed merge text in json")
	return ""