# reference: https://www.crummy.com/software/BeautifulSoup/bs4/doc/#for-html-documents
import os
from io import StringIO

import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml.etree import ParserError, XMLSyntaxError

from DefaultPackages import openFile, saveFile
from NER import cleanText
class HTML:
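    """Fetch a remote HTML page (or open a local HTML file) and extract
    its text, sections, references, supplementary links, and tables."""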
def __init__(self, htmlFile, htmlLink):
self.htmlLink = htmlLink
self.htmlFile = htmlFile
def openHTMLFile(self):
not_need_domain = ['https://broadinstitute.github.io/picard/',
'https://software.broadinstitute.org/gatk/best-practices/',
'https://www.ncbi.nlm.nih.gov/genbank/',
'https://www.mitomap.org/']
headers = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/114.0.0.0 Safari/537.36"
),
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Referer": self.htmlLink,
"Connection": "keep-alive"
}
session = requests.Session()
session.headers.update(headers)
if self.htmlLink in not_need_domain:
return BeautifulSoup("", 'html.parser')
try:
if self.htmlLink and self.htmlLink != "None":
r = session.get(self.htmlLink, allow_redirects=True, timeout=15)
if r.status_code != 200 or not r.text.strip():
                    print(f"HTML GET failed ({r.status_code}) or empty page: {self.htmlLink}")
return BeautifulSoup("", 'html.parser')
soup = BeautifulSoup(r.content, 'html.parser')
else:
with open(self.htmlFile, encoding='utf-8') as fp:
soup = BeautifulSoup(fp, 'html.parser')
except (ParserError, XMLSyntaxError, OSError) as e:
            print(f"HTML parse error for {self.htmlLink}: {type(e).__name__}")
return BeautifulSoup("", 'html.parser')
except Exception as e:
            print(f"General exception for {self.htmlLink}: {e}")
return BeautifulSoup("", 'html.parser')
return soup
    def getText(self):
        try:
            soup = self.openHTMLFile()
            s = soup.find_all("html")
            text = ""
            if s:
                # concatenate the text of every <html> node instead of
                # keeping only the last one
                for tag in s:
                    text += tag.get_text()
                cl = cleanText.cleanGenText()
                text = cl.removeExtraSpaceBetweenWords(text)
            return text
        except Exception as e:
            print(f"failed to get text from html: {e}")
            return ""
    def getListSection(self, scienceDirect=None):
        try:
            # use `sections` rather than `json` so the stdlib module name
            # is not shadowed
            sections = {}
            text = ""
            textJson, textHTML = "", ""
            if scienceDirect is None:
                soup = self.openHTMLFile()
                h2_tags = soup.find_all('h2')
                for h2 in h2_tags:
                    section_title = h2.get_text(strip=True)
                    sections.setdefault(section_title, [])
                    # Walk forward through <p> and <h2> tags in document
                    # order and collect paragraphs until the next <h2>;
                    # comparing a <p> tag against the next <h2> tag can
                    # never match, so the boundary check must be on tag name
                    for el in h2.find_all_next(["p", "h2"]):
                        if el.name == "h2":
                            break
                        sections[section_title].append(el.get_text(strip=True))
            # expected section layout:
            '''sections = {'Abstract':[], 'Introduction':[], 'Methods':[],
            'Results':[], 'Discussion':[], 'References':[],
            'Acknowledgements':[], 'Author information':[], 'Ethics declarations':[],
            'Additional information':[], 'Electronic supplementary material':[],
            'Rights and permissions':[], 'About this article':[], 'Search':[], 'Navigation':[]}'''
            if scienceDirect is not None or len(sections) == 0:
                # Elsevier API key, read from the environment
                api_key = os.environ["SCIENCE_DIRECT_API"]
                # ScienceDirect article DOI, e.g. "10.1016/j.ajhg.2011.01.009"
                doi = self.htmlLink.split("https://doi.org/")[-1]
                # Base URL for the Elsevier article-retrieval API
                base_url = "https://api.elsevier.com/content/article/doi/"
                # Set headers with the API key
                headers = {
                    "Accept": "application/json",
                    "X-ELS-APIKey": api_key
                }
                # Make the API request
                response = requests.get(base_url + doi, headers=headers, timeout=15)
                # Check if the request was successful
                if response.status_code == 200:
                    data = response.json()
                    if isinstance(data, dict):
                        sections["fullText"] = data
            textJson = self.mergeTextInJson(sections)
            textHTML = self.getText()
            # prefer whichever extraction recovered more text
            text = textHTML if len(textHTML) > len(textJson) else textJson
            return text
        except Exception as e:
            print(f"getListSection failed: {e}")
            return ""
    def getReference(self):
        # collect references to follow for more data; this expects the
        # section dict, so guard against a plain-text return value
        ref = []
        sections = self.getListSection()
        if not isinstance(sections, dict):
            return ref
        for key in sections.get("References", []):
            ct = cleanText.cleanGenText(key)
            # use a distinct name so the imported cleanText module
            # is not shadowed inside the loop
            cleaned, filteredWord = ct.cleanText()
            if cleaned not in ref:
                ref.append(cleaned)
        return ref
    def getSupMaterial(self):
        # collect links from supplementary/additional-material sections
        sections = {}
        soup = self.openHTMLFile()
        h2_tags = soup.find_all('h2')
        keywords = ("supplementary", "material", "additional", "support")
        for h2Pos, h2 in enumerate(h2_tags):
            title = h2.text
            if any(k in title.lower() for k in keywords):
                link = []
                sections.setdefault(title, [])
                for l in h2.find_all_next("a", href=True):
                    link.append(l["href"])
                # truncate at the first link of the next <h2>, if it has one
                if h2Pos + 1 < len(h2_tags):
                    nexth2Anchor = h2_tags[h2Pos + 1].find_next("a", href=True)
                    if nexth2Anchor and nexth2Anchor["href"] in link:
                        link = link[:link.index(nexth2Anchor["href"])]
                # only keep links containing "https"
                output = [i for i in link if "https" in i]
                sections[title].extend(output)
        return sections
    def extractTable(self):
        soup = self.openHTMLFile()
        df = []
        if len(soup) > 0:
            try:
                # wrap in StringIO: pandas deprecates passing literal HTML
                df = pd.read_html(StringIO(str(soup)))
            except ValueError:
                df = []
                print("No tables found in HTML file")
        return df
    def mergeTextInJson(self, jsonHTML):
        # flatten the section dict into a single text string
        try:
            htmlText = ""
            if jsonHTML:
                htmlText += str(jsonHTML)
            return htmlText
        except Exception as e:
            print(f"failed to merge text in json: {e}")
            return ""
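# ---------------------------------------------------------------------------
# Minimal usage sketch (assumptions: network access, and the project-local
# DefaultPackages and NER modules importable; the URL is a hypothetical
# example, not one taken from this project's data).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    page = HTML(htmlFile=None, htmlLink="https://www.nature.com/articles/s41586-020-2649-2")
    print(page.getText()[:500])        # first 500 characters of cleaned page text
    print(page.getSupMaterial())       # supplementary-material links keyed by section
    for table in page.extractTable():  # any <table> elements parsed by pandas
        print(table.head())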