# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
from io import StringIO

import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError

from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML:
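    """Fetch a remote HTML page (or open a local HTML file) and extract
    its text, sections, references, supplementary links, and tables."""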
def __init__(self, htmlFile, htmlLink):
self.htmlLink = htmlLink
self.htmlFile = htmlFile
def openHTMLFile(self):
not_need_domain = ['https://broadinstitute.github.io/picard/',
'https://software.broadinstitute.org/gatk/best-practices/',
'https://www.ncbi.nlm.nih.gov/genbank/',
'https://www.mitomap.org/']
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": self.htmlLink,
"Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(headers)
if self.htmlLink in not_need_domain:
return BeautifulSoup("", 'html.parser')
try:
if self.htmlLink and self.htmlLink != "None":
r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
if r.status_code != 200 or not r.text.strip():
                    print(f"HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
return BeautifulSoup("", 'html.parser')
soup = BeautifulSoup(r.content, 'html.parser')
else:
with open(self.htmlFile, encoding='utf-8') as fp:
soup = BeautifulSoup(fp, 'html.parser')
except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"HTML parse error for {self.htmlLink}: {type(e).__name__}")
return BeautifulSoup("", 'html.parser')
except Exception as e:
            print(f"General exception for {self.htmlLink}: {e}")
return BeautifulSoup("", 'html.parser')
return soup
    def getText(self):
        try:
            soup = self.openHTMLFile()
            s = soup.find_all("html")
            text = ""
            if s:
                # concatenate the text of every <html> node instead of
                # keeping only the last one
                for tag in s:
                    text += tag.get_text()
                cl = cleanText.cleanGenText()
                text = cl.removeExtraSpaceBetweenWords(text)
            return text
        except Exception as e:
            print(f"failed to get text from html: {e}")
            return ""
    def getListSection(self, scienceDirect=None):
        try:
            # use `sections` rather than `json` so the stdlib module name
            # is not shadowed
            sections = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                h2_tags = soup.find_all('h2')
                for h2 in h2_tags:
                    section_title = h2.get_text(strip=True)
                    sections.setdefault(section_title, [])
                    # Walk forward through <p> and <h2> tags in document
                    # order and collect paragraphs until the next <h2>;
                    # comparing a <p> tag against the next <h2> tag can
                    # never match, so the boundary check must be on tag name
                    for el in h2.find_all_next(["p", "h2"]):
                        if el.name == "h2":
                            break
                        sections[section_title].append(el.get_text(strip=True))
            # expected section layout:
            '''sections = {'Abstract':[], 'Introduction':[], 'Methods':[],
            'Results':[], 'Discussion':[], 'References':[],
            'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
            'Additional information':[], 'Electronic supplementary material':[],
            'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(sections) == 0:
                # Elsevier API key, read from the environment
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI, e.g. "10.1016/j.ajhg.2011.01.009"
                doi = self.htmlLink.split("https://doi.org/")[-1]
                # Base URL for the Elsevier article-retrieval API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with the API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers, timeout=15)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    if isinstance(data, dict):
                        sections["fullText"] = data
            textJson = self.mergeTextInJson(sections)
            textHTML = self.getText()
            # prefer whichever extraction recovered more text
            text = textHTML if len(textHTML) > len(textJson) else textJson
            return text
        except Exception as e:
            print(f"getListSection failed: {e}")
            return ""
    def getReference(self):
        # collect references to follow for more data; this expects the
        # section dict, so guard against a plain-text return value
        ref = []
        sections = self.getListSection()
        if not isinstance(sections, dict):
            return ref
        for key in sections.get("References", []):
            ct = cleanText.cleanGenText(key)
            # use a distinct name so the imported cleanText module
            # is not shadowed inside the loop
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref
    def getSupMaterial(self):
        # collect links from supplementary/additional-material sections
        sections = {}
        soup = self.openHTMLFile()
        h2_tags = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos, h2 in enumerate(h2_tags):
            title = h2.text
            if any(k in title.lower() for k in keywords):
                link = []
                sections.setdefault(title, [])
                for l in h2.find_all_next("a", href=True):
                    link.append(l["href"])
                # truncate at the first link of the next <h2>, if it has one
                if h2Pos + 1 < len(h2_tags):
                    nexth2Anchor = h2_tags[h2Pos + 1].find_next("a", href=True)
                    if nexth2Anchor and nexth2Anchor["href"] in link:
                        link = link[:link.index(nexth2Anchor["href"])]
                # only keep links containing "https"
                output = [i for i in link if "https" in i]
                sections[title].extend(output)
        return sections
    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                # wrap in StringIO: pandas deprecates passing literal HTML
                df = pd.read_html(StringIO(str(soup)))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df
    def mergeTextInJson(self, jsonHTML):
        # flatten the section dict into a single text string
        try:
            htmlText = ""
            if jsonHTML:
                htmlText += str(jsonHTML)
            return htmlText
        except Exception as e:
            print(f"failed to merge text in json: {e}")
            return ""
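# ---------------------------------------------------------------------------
# Minimal usage sketch (assumptions: network access, and the project-local
# DefaultPackages and NER modules importable; the URL is a hypothetical
# example, not one taken from this project's data).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    page = HTML(htmlFile=None, htmlLink="https://www.nature.com/articles/s41586-020-2649-2")
    print(page.getText()[:500])        # first 500 characters of cleaned page text
    print(page.getSupMaterial())       # supplementary-material links keyed by section
    for table in page.extractTable():  # any <table> elements parsed by pandas
        print(table.head())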